Commit 17dc32e3 by lzzzzl

Merge branch 'master' of ssh://119.23.72.7:22611/lic/bom_identify into dev

# Conflicts:
#	utils/excel_manager.py
parents 76ff0b71 efc76efc
......@@ -18,8 +18,9 @@ class Classify(classify_pb2_grpc.classifyServicer):
def Classify(self, request, context):
print('接收数据: ' + request.keyword)
res = self.predictorfac.predict(request.keyword)
dic_data = json.loads(request.keyword)
print(dic_data)
res = self.predictorfac.predict(dic_data)
return classify_pb2.ClassifyReply(message='result {msg}'.format(msg=res))
def fac_test_predic(self, data):
......@@ -45,4 +46,7 @@ if __name__ == '__main__':
# data = read_from_excel('DZ0901_V1.4_BOM.xlsx', 'DZ0901_V1.3BOM清单')
# print(data)
# print(Classify().fac_test_predic(data))
for i in range(65, 91):
print(chr(i))
......@@ -4,7 +4,8 @@ from protobuf import classify_pb2_grpc
import pandas as pd
import json
def get_test_data():
df=pd.read_excel(r'C:\Users\ICHUNT\Desktop\bomlist\DZ0901_V1.4_BOM.xlsx',header=None)
df=pd.read_excel(r'C:\Users\ICHUNT\Desktop\bomlist\00 BSJ BMS合并-询价-珠海能源.xlsx', header=None)
print(df)
df.fillna(' ', inplace=True)
dic_dft = df.to_dict(orient='list')
return json.dumps(dic_dft)
......
No preview for this file type
No preview for this file type
......@@ -19,10 +19,9 @@ class PredictorFac():
elif isinstance(data, dict):
return self.dict_predictor
def predict(self, data):
dic_data = json.loads(data)
def predict(self, dic_data, predict_type='all'):
predictor = self._get_predictor(dic_data)
res = predictor.predict(dic_data)
res = predictor.predict(dic_data, predict_type)
return res
......@@ -30,8 +29,8 @@ if __name__ == "__main__":
def get_test_data():
import pandas as pd
import json
df = pd.read_excel(r'C:\Users\ICHUNT\Desktop\bomlist\DZ0901_V1.4_BOM.xlsx', header=None)
df.fillna(' ', inplace=True)
df = pd.read_excel(r'C:\data\lx\51AB0571_ CCTV ASST询价_SZIMS.xlsx', header=None, sheet_name='1')
df.fillna('?', inplace=True)
dic_dft = df.to_dict(orient='list')
return json.dumps(dic_dft)
......@@ -40,7 +39,8 @@ if __name__ == "__main__":
data = get_test_data()
p = PredictorFac(model_config)
res = p.predict(data)
data = json.loads(data)
res = p.predict(data,predict_type='model')
print(res)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
from utils.log_manager import get_logger
from utils.robots import dd_send_msg
import pandas as pd
NUMBER_LEVEL = 0.7
SEQ_LEVEL = 0.5
class BasePredictor:
'''
预测类基类
......@@ -20,6 +22,7 @@ class BasePredictor:
self.robot_msg = dd_send_msg
self.pd = pd
def predict(self, key):
    """
    Abstract prediction hook of the base predictor.

    :param key: input handed to the predictor (not used by this base version)
    :raises NotImplementedError: always; this base implementation does nothing else
    """
    raise NotImplementedError()
......@@ -37,3 +40,47 @@ class BasePredictor:
deal_list = [round(i, 3) for i in proba[0].tolist()]
dic_proba = {k: v for k, v in zip(classes, deal_list)}
return predictions[0], dic_proba
def isseq(self, data):
    """
    Decide whether a column looks like a serial-number (row index) column.

    Numeric cells (``int`` or ``float``) are truncated to ``int``; the column
    qualifies when those values never decrease and make up at least
    ``SEQ_LEVEL`` of all cells.

    :param data: list of cell values of one column
    :return: True when the column looks like a sequence column, else False
    """
    collect_seq = []
    for kw in data:
        if not isinstance(kw, (float, int)):
            continue
        try:
            collect_seq.append(int(kw))
        except (ValueError, OverflowError):
            # NaN / +-inf cannot be converted to int; such a cell is not a
            # sequence number, so skip it instead of crashing the prediction.
            continue
    # No numeric cell at all, or the numbers go down somewhere: not a sequence.
    if not collect_seq or not self.isIncrease(collect_seq, len(collect_seq)):
        return False
    rate = round(len(collect_seq) / len(data), 3)
    return rate >= SEQ_LEVEL
def isnum(self, data):
    """
    Decide whether a column looks like a quantity column.

    A cell counts as a quantity when it is an ``int`` or when
    ``self.isNumberCol`` accepts it. The column qualifies when the share of
    such cells reaches ``NUMBER_LEVEL``.

    :param data: list of cell values of one column
    :return: True when the column looks like a quantity column, else False
             (an empty column is never a quantity column)
    """
    if len(data) == 0:
        # Nothing to measure; also avoids dividing by zero below.
        return False
    collect_num = [kw for kw in data if isinstance(kw, int) or self.isNumberCol(kw)]
    rate = round(len(collect_num) / len(data), 3)
    return rate >= NUMBER_LEVEL
def isIncrease(self, arr, size):
    """
    Check whether the first ``size`` elements of ``arr`` never decrease.

    Equal neighbours are accepted (the comparison is ``>=``). The check is
    iterative so that long columns do not exhaust the interpreter's
    recursion limit.

    :param arr: sequence of comparable values
    :param size: number of leading elements to inspect
    :return: True when ``arr[0:size]`` is non-decreasing; True for ``size <= 1``
    """
    return all(arr[i] >= arr[i - 1] for i in range(1, size))
def isNumberCol(self, kw):
    """
    Test whether a single cell is written like a quantity.

    A string qualifies when it is digits followed by either ``K``
    (case-insensitive) or one to three CJK characters.

    :param kw: cell value
    :return: the ``re`` match object (or None when there is no match) for
             strings; False for any non-string value
    """
    if not isinstance(kw, str):
        return False
    quantity_pattern = r'(\d+)((K)|([\u4E00-\u9FA5]{1,3}))$'
    flags = re.M | re.I
    return re.match(quantity_pattern, kw, flags)
def valid_seq(self, data):
    """Negation of ``isseq``: True when the column is NOT a sequence column."""
    looks_like_sequence = self.isseq(data)
    return not looks_like_sequence
def valid_num(self, data):
    """Negation of ``isnum``: True when the column is NOT a quantity column."""
    looks_like_quantity = self.isnum(data)
    return not looks_like_quantity
\ No newline at end of file
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from collections import Counter
from predict.base_handler import BasePredictor
# 可能的头部字段
prob_fields = ["序号", "名称", "规格", "MPN", "用量(pcs)", "用量", "pcs", "位号", "描述", "值", "数量", "封装", "类别", "a面位置", "b面位置", "备注",
"需求数量",
"需求数量", "售价",
"参考品牌", "品牌", "item", "厂商编码", "品牌/厂商", "参考料号", "参考供应商", "top面", "bottom面"]
# 标准名和代名词的映射
fields_map = {"序号": ["序号"],
"类别": ["类别", "分类", "名称", "类别名称"],
"参数": ["参数", "规格", "描述"],
"型号": ["型号", "参考料号", "料号", "MPN"],
"型号": ["型号", "参考料号", "料号", "MPN", "厂商编码"],
"数量": ["数量", "用量(pcs)", "PCS", "用量", "用量(PCS)", "pcs"],
"封装": ["封装", "封装规格"],
"品牌": ["品牌", "品牌/厂商", "参考品牌", "厂商编码", "参考供应商", "厂商", "参考供应商", "参考厂商"]}
"品牌": ["品牌", "品牌/厂商", "参考品牌", "参考供应商", "厂商", "参考供应商", "参考厂商"]}
# 必须返回也必须验证的标准字段
MUST_STD_FIELDS = ['参数', '数量']
#
order_list = ['序号']
en_to_zh_map = {'brand_name': '品牌', 'param': '参数', 'goods_name': '型号'}
def fun(seri):
li_seri = seri.tolist()
......@@ -28,14 +32,18 @@ def fun(seri):
# 取前多少行
HEAD_ROW = 5
HEAD_ROW = 7
# 空置率阈值
NAN_RATE = 0.8
# 占位符
PLACEHOLDER = '?'
class DicPredict(BasePredictor):
def id_by_field(self, df_head):
"""
:param df_head: 传入接收数据的头部Dataframe(默认5行)
:param df_head: 传入接收数据的头部Dataframe(默认7行)
:return li_res: 返回结果列表
"""
series = df_head.apply(fun)
......@@ -60,14 +68,6 @@ class DicPredict(BasePredictor):
columns = []
li_data = []
for k, v in dict_data.items():
# 去掉空置率大于等于0.8的列
counter = 0
for item in v:
if not str(item).strip():
counter += 1
if counter / len(v) >= 0.8:
continue
columns.append(k)
li_data.append(v)
......@@ -84,31 +84,89 @@ class DicPredict(BasePredictor):
else:
dic_ab_res = {i['column_name']: i['pronoun']}
ab_result.append(dic_ab_res)
id_res = {
pre_id_res = {
'std_result': std_result,
'ab_result': ab_result,
}
return id_res
def predict(self, dic_data):
res = self.pre_predict(dic_data)
if res:
return res
if len(dic_data) > 0:
self.order_predict(dic_data[0])
pass
def order_predict(self, data):
collect_num = [int(kw) for kw in data if isinstance(kw, float) or isinstance(kw, int)]
judge = self.IsIncrease(collect_num, len(collect_num))
print('judge: ' + str(judge))
return judge
"""
判断列表元素是否递增
"""
def IsIncrease(self, arr, size):
if size == 1:
return pre_id_res
def model_predict(self, dic_data):
"""
Model-based column identification.
Per the original author's note, this method currently only predicts the
[parameter, part number, quantity, brand] fields.
NOTE(review): no ``return`` statement is visible for this method in this
view; the result is only printed and ``pre_model_res`` is left empty, so
callers appear to receive None -- confirm whether this is unfinished.
NOTE(review): indentation was lost in this view, so the exact nesting of
the statements below should be confirmed against the real file.
:param dic_data: mapping of column key -> list of cell values (it is iterated with ``.items()``)
:return:
"""
print(dic_data)
# Columns that pass every valid_* check and are handed to the model below.
prob_columns = []
# Column key -> predicted standard field name.
temp_pre_model_res = {}
for k, v in dic_data.items():
# v_chain is truthy when none of the valid_* validators rejects the column.
bol = self.v_chain(v)
if bol:
print(k, bol)
prob_columns.append(k)
continue
# Columns rejected by the chain are labelled by the rule-based checks.
if self.isnum(v):
temp_pre_model_res[k] = '数量'
continue
if self.isseq(v):
temp_pre_model_res[k] = '序号'
# Strip placeholder cells so that only real values are classified.
temp_dic_data = {k: list(filter(lambda x: x != PLACEHOLDER, dic_data[k]))for k in prob_columns}
for k, v in temp_dic_data.items():
li_single_pred_res = []
for string in v:
# presumably returns (predicted label, per-class probabilities) -- verify against get_single_predict
single_pred_res, probdic = self.get_single_predict(string)
li_single_pred_res.append(single_pred_res)
# Majority vote over the per-cell labels of this column.
result = Counter(li_single_pred_res)
# [('brand_name', 4), ('goods_name', 3), ('param', 2)]
li_sort = sorted(result.items(), key=lambda x: x[1], reverse=True)
print(k, li_sort)
# NOTE(review): li_sort[0] assumes the column still has at least one cell after
# placeholder removal, and the label must be a key of en_to_zh_map -- confirm.
temp_pre_model_res[k] = en_to_zh_map[li_sort[0][0]]
print(temp_pre_model_res)
pre_model_res = {}
def predict(self, dic_data, predict_type='all'):
    """
    Identify the columns of ``dic_data`` with the requested strategy.

    The data is first passed through ``pre_deal``. ``'all'`` and ``'pre'``
    both use ``pre_predict``; ``'model'`` uses ``model_predict``.

    :param dic_data: column data handed to ``pre_deal``
    :param predict_type: one of ``'all'``, ``'model'``, ``'pre'``
    :return: the strategy's result when it is truthy; otherwise None (also
             None for an unrecognised ``predict_type``)
    """
    cleaned = self.pre_deal(dic_data)
    if predict_type in ('all', 'pre'):
        outcome = self.pre_predict(cleaned)
    elif predict_type == 'model':
        outcome = self.model_predict(cleaned)
    else:
        return None
    return outcome if outcome else None
def pre_deal(self, dic_data):
    """
    Drop columns that are mostly empty before prediction.

    A cell is considered empty when its stripped string form equals
    ``PLACEHOLDER``. Columns whose vacancy rate is greater than or equal to
    ``NAN_RATE`` are removed, as are columns with no cells at all.

    :param dic_data: mapping of column key -> list of cell values
    :return: new dict holding only the columns that are kept
    """
    new_dic_data = {}
    for k, v in dic_data.items():
        if len(v) == 0:
            # A column without cells has nothing to classify (and would
            # otherwise divide by zero below).
            continue
        counter = sum(1 for item in v if str(item).strip() == PLACEHOLDER)
        # Strict comparison: a column sitting exactly on the threshold is
        # removed too, matching the "vacancy rate >= threshold" rule.
        if counter / len(v) < NAN_RATE:
            new_dic_data[k] = v
    return new_dic_data
def v_chain(self, li):
    """
    Validation chain: run every ``valid_*`` method of this object on ``li``.

    Stops at the first validator that returns a falsy value.

    :param li: list of cell values of one column
    :return: False as soon as one validator rejects the column, True when
             none does (including when there are no validators)
    """
    validators = (
        getattr(self, attr_name)
        for attr_name in dir(self)
        if attr_name.startswith('valid_')
    )
    return all(check(li) for check in validators)
return (arr[size - 1] >= arr[size - 2]) and self.IsIncrease(arr, size - 1)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import pandas as pd
# def read_from_excel(file_name, sheet_name):
# wb = xlrd.open_workbook(file_name)
# sheet = wb.sheet_by_name(sheet_name)
# row = sheet.nrows
# col = sheet.ncols
# result_dict = {}
# for i in range(col):
# col_list = []
# for j in range(row): col_list.append(sheet.cell_value(j, i))
# result_dict[i] = col_list
# return result_dict
def read_from_excel(file_name, sheet_name):
    """
    Read one sheet of an Excel workbook into a column-oriented dict.

    The sheet is read without a header row and without converting blank
    cells to NaN (``keep_default_na=False``).

    :param file_name: path of the workbook
    :param sheet_name: sheet to read
    :return: dict mapping the zero-based column position to the list of
             that column's cell values
    """
    # header=None is the supported way to say "no header row"; current pandas
    # rejects negative header values such as the former header=-1.
    data = pd.read_excel(file_name, sheet_name=sheet_name, keep_default_na=False, header=None)
    return {position: data[column].tolist() for position, column in enumerate(data.columns)}
l=[' ', ' ', ' ', '不需要报价', ' ', ' ', ' ', '不需要报价', ' ', ' ', '不需要报价', ' ', ' ', ' ', '不需要报价', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
# NOTE(review): this module-level loop uses PLACEHOLDER, NAN_RATE, new_dic_data
# and k, none of which are defined in the visible part of this module. Unless
# they are defined elsewhere it would raise NameError when the module is
# imported. It looks like leftover scratch code -- confirm and remove or guard it.
for v in l:
# Drop columns whose vacancy rate is greater than or equal to 0.8
counter = 0
# v is a single string element of l here, so this walks its characters.
for item in v:
if str(item).strip() == PLACEHOLDER:
counter += 1
if counter / len(v) <= NAN_RATE:
new_dic_data[k] = v
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment