Commit 17dc32e3 by lzzzzl

Merge branch 'master' of ssh://119.23.72.7:22611/lic/bom_identify into dev

# Conflicts:
#	utils/excel_manager.py
parents 76ff0b71 efc76efc
...@@ -18,8 +18,9 @@ class Classify(classify_pb2_grpc.classifyServicer): ...@@ -18,8 +18,9 @@ class Classify(classify_pb2_grpc.classifyServicer):
def Classify(self, request, context): def Classify(self, request, context):
print('接收数据: ' + request.keyword) dic_data = json.loads(request.keyword)
res = self.predictorfac.predict(request.keyword) print(dic_data)
res = self.predictorfac.predict(dic_data)
return classify_pb2.ClassifyReply(message='result {msg}'.format(msg=res)) return classify_pb2.ClassifyReply(message='result {msg}'.format(msg=res))
def fac_test_predic(self, data): def fac_test_predic(self, data):
...@@ -45,4 +46,7 @@ if __name__ == '__main__': ...@@ -45,4 +46,7 @@ if __name__ == '__main__':
# data = read_from_excel('DZ0901_V1.4_BOM.xlsx', 'DZ0901_V1.3BOM清单') # data = read_from_excel('DZ0901_V1.4_BOM.xlsx', 'DZ0901_V1.3BOM清单')
# print(data) # print(data)
# print(Classify().fac_test_predic(data)) # print(Classify().fac_test_predic(data))
for i in range(65, 91):
print(chr(i))
...@@ -4,7 +4,8 @@ from protobuf import classify_pb2_grpc ...@@ -4,7 +4,8 @@ from protobuf import classify_pb2_grpc
import pandas as pd import pandas as pd
import json import json
def get_test_data(): def get_test_data():
df=pd.read_excel(r'C:\Users\ICHUNT\Desktop\bomlist\DZ0901_V1.4_BOM.xlsx',header=None) df=pd.read_excel(r'C:\Users\ICHUNT\Desktop\bomlist\00 BSJ BMS合并-询价-珠海能源.xlsx', header=None)
print(df)
df.fillna(' ', inplace=True) df.fillna(' ', inplace=True)
dic_dft = df.to_dict(orient='list') dic_dft = df.to_dict(orient='list')
return json.dumps(dic_dft) return json.dumps(dic_dft)
......
No preview for this file type
No preview for this file type
...@@ -19,10 +19,9 @@ class PredictorFac(): ...@@ -19,10 +19,9 @@ class PredictorFac():
elif isinstance(data, dict): elif isinstance(data, dict):
return self.dict_predictor return self.dict_predictor
def predict(self, data): def predict(self, dic_data, predict_type='all'):
dic_data = json.loads(data)
predictor = self._get_predictor(dic_data) predictor = self._get_predictor(dic_data)
res = predictor.predict(dic_data) res = predictor.predict(dic_data, predict_type)
return res return res
...@@ -30,8 +29,8 @@ if __name__ == "__main__": ...@@ -30,8 +29,8 @@ if __name__ == "__main__":
def get_test_data(): def get_test_data():
import pandas as pd import pandas as pd
import json import json
df = pd.read_excel(r'C:\Users\ICHUNT\Desktop\bomlist\DZ0901_V1.4_BOM.xlsx', header=None) df = pd.read_excel(r'C:\data\lx\51AB0571_ CCTV ASST询价_SZIMS.xlsx', header=None, sheet_name='1')
df.fillna(' ', inplace=True) df.fillna('?', inplace=True)
dic_dft = df.to_dict(orient='list') dic_dft = df.to_dict(orient='list')
return json.dumps(dic_dft) return json.dumps(dic_dft)
...@@ -40,7 +39,8 @@ if __name__ == "__main__": ...@@ -40,7 +39,8 @@ if __name__ == "__main__":
data = get_test_data() data = get_test_data()
p = PredictorFac(model_config) p = PredictorFac(model_config)
res = p.predict(data) data = json.loads(data)
res = p.predict(data,predict_type='model')
print(res) print(res)
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding:utf-8 -*- # -*- coding:utf-8 -*-
import re
from utils.log_manager import get_logger from utils.log_manager import get_logger
from utils.robots import dd_send_msg from utils.robots import dd_send_msg
import pandas as pd import pandas as pd
NUMBER_LEVEL = 0.7
SEQ_LEVEL = 0.5
class BasePredictor: class BasePredictor:
''' '''
预测类基类 预测类基类
...@@ -20,6 +22,7 @@ class BasePredictor: ...@@ -20,6 +22,7 @@ class BasePredictor:
self.robot_msg = dd_send_msg self.robot_msg = dd_send_msg
self.pd = pd self.pd = pd
def predict(self, key): def predict(self, key):
raise NotImplementedError raise NotImplementedError
...@@ -37,3 +40,47 @@ class BasePredictor: ...@@ -37,3 +40,47 @@ class BasePredictor:
deal_list = [round(i, 3) for i in proba[0].tolist()] deal_list = [round(i, 3) for i in proba[0].tolist()]
dic_proba = {k: v for k, v in zip(classes, deal_list)} dic_proba = {k: v for k, v in zip(classes, deal_list)}
return predictions[0], dic_proba return predictions[0], dic_proba
def isseq(self, data):
    """Decide whether a column looks like a sequence-number (row index) column.

    Numeric cells (int or float) are truncated to int. The column qualifies
    when those values are non-decreasing and make up at least SEQ_LEVEL of
    all cells.

    :param data: list of cell values for one column
    :return: True if the column looks like a sequence-number column
    """
    collect_seq = []
    for kw in data:
        if not isinstance(kw, (int, float)):
            continue
        try:
            collect_seq.append(int(kw))
        except (ValueError, OverflowError):
            # NaN and +/-infinity cannot be converted to int; treat such
            # cells as non-numeric instead of aborting the whole prediction.
            continue
    if not collect_seq or not self.isIncrease(collect_seq, len(collect_seq)):
        return False
    rate = round(len(collect_seq) / len(data), 3)
    return rate >= SEQ_LEVEL
def isnum(self, data):
    """Decide whether a column looks like a quantity column.

    A cell counts as a quantity when it is an int or when isNumberCol
    accepts it. The column qualifies when such cells make up at least
    NUMBER_LEVEL of all cells.

    :param data: list of cell values for one column
    :return: True if the column looks like a quantity column
    """
    if len(data) == 0:
        # No cells means no evidence; also avoids dividing by zero below.
        return False
    collect_num = [kw for kw in data if isinstance(kw, int) or self.isNumberCol(kw)]
    rate = round(len(collect_num) / len(data), 3)
    return rate >= NUMBER_LEVEL
def isIncrease(self, arr, size):
    """Check whether the first ``size`` elements of ``arr`` are non-decreasing.

    Implemented iteratively so long columns cannot exhaust the interpreter's
    recursion limit. Zero or one element is treated as increasing.

    :param arr: sequence of comparable values
    :param size: number of leading elements to inspect
    :return: True when every inspected element is >= its predecessor
    """
    if size <= 1:
        return True
    return all(arr[i] >= arr[i - 1] for i in range(1, size))
def isNumberCol(self, kw):
    """Test whether a single cell reads like a quantity with a unit suffix.

    A string qualifies when it is digits followed by either ``K`` (any case)
    or one to three CJK characters. Non-string input yields False; a string
    yields the ``re.match`` result (a match object, or None when it does
    not match).
    """
    if not isinstance(kw, str):
        return False
    flags = re.M | re.I
    matched = re.match(r'(\d+)((K)|([\u4E00-\u9FA5]{1,3}))$', kw, flags)
    return matched
def valid_seq(self, data):
    """Validator: passes (True) only when the column is NOT a sequence-number column."""
    looks_like_sequence = self.isseq(data)
    return not looks_like_sequence
def valid_num(self, data):
    """Validator: passes (True) only when the column is NOT a quantity column."""
    looks_like_quantity = self.isnum(data)
    return not looks_like_quantity
\ No newline at end of file
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding:utf-8 -*- # -*- coding:utf-8 -*-
from collections import Counter
from predict.base_handler import BasePredictor from predict.base_handler import BasePredictor
# 可能的头部字段 # 可能的头部字段
prob_fields = ["序号", "名称", "规格", "MPN", "用量(pcs)", "用量", "pcs", "位号", "描述", "值", "数量", "封装", "类别", "a面位置", "b面位置", "备注", prob_fields = ["序号", "名称", "规格", "MPN", "用量(pcs)", "用量", "pcs", "位号", "描述", "值", "数量", "封装", "类别", "a面位置", "b面位置", "备注",
"需求数量", "需求数量", "售价",
"参考品牌", "品牌", "item", "厂商编码", "品牌/厂商", "参考料号", "参考供应商", "top面", "bottom面"] "参考品牌", "品牌", "item", "厂商编码", "品牌/厂商", "参考料号", "参考供应商", "top面", "bottom面"]
# 标准名和代名词的映射 # 标准名和代名词的映射
fields_map = {"序号": ["序号"], fields_map = {"序号": ["序号"],
"类别": ["类别", "分类", "名称", "类别名称"], "类别": ["类别", "分类", "名称", "类别名称"],
"参数": ["参数", "规格", "描述"], "参数": ["参数", "规格", "描述"],
"型号": ["型号", "参考料号", "料号", "MPN"], "型号": ["型号", "参考料号", "料号", "MPN", "厂商编码"],
"数量": ["数量", "用量(pcs)", "PCS", "用量", "用量(PCS)", "pcs"], "数量": ["数量", "用量(pcs)", "PCS", "用量", "用量(PCS)", "pcs"],
"封装": ["封装", "封装规格"], "封装": ["封装", "封装规格"],
"品牌": ["品牌", "品牌/厂商", "参考品牌", "厂商编码", "参考供应商", "厂商", "参考供应商", "参考厂商"]} "品牌": ["品牌", "品牌/厂商", "参考品牌", "参考供应商", "厂商", "参考供应商", "参考厂商"]}
# 必须返回也必须验证的标准字段
MUST_STD_FIELDS = ['参数', '数量']
# #
order_list = ['序号'] order_list = ['序号']
en_to_zh_map = {'brand_name': '品牌', 'param': '参数', 'goods_name': '型号'}
def fun(seri): def fun(seri):
li_seri = seri.tolist() li_seri = seri.tolist()
...@@ -28,14 +32,18 @@ def fun(seri): ...@@ -28,14 +32,18 @@ def fun(seri):
# 取前多少行 # 取前多少行
HEAD_ROW = 5 HEAD_ROW = 7
# 空置率阈值
NAN_RATE = 0.8
# 占位符
PLACEHOLDER = '?'
class DicPredict(BasePredictor): class DicPredict(BasePredictor):
def id_by_field(self, df_head): def id_by_field(self, df_head):
""" """
:param df_head: 传入接收数据的头部Dataframe(默认5行) :param df_head: 传入接收数据的头部Dataframe(默认7行)
:return li_res: 返回结果列表 :return li_res: 返回结果列表
""" """
series = df_head.apply(fun) series = df_head.apply(fun)
...@@ -60,14 +68,6 @@ class DicPredict(BasePredictor): ...@@ -60,14 +68,6 @@ class DicPredict(BasePredictor):
columns = [] columns = []
li_data = [] li_data = []
for k, v in dict_data.items(): for k, v in dict_data.items():
# 去掉空置率大于等于0.8的列
counter = 0
for item in v:
if not str(item).strip():
counter += 1
if counter / len(v) >= 0.8:
continue
columns.append(k) columns.append(k)
li_data.append(v) li_data.append(v)
...@@ -84,31 +84,89 @@ class DicPredict(BasePredictor): ...@@ -84,31 +84,89 @@ class DicPredict(BasePredictor):
else: else:
dic_ab_res = {i['column_name']: i['pronoun']} dic_ab_res = {i['column_name']: i['pronoun']}
ab_result.append(dic_ab_res) ab_result.append(dic_ab_res)
id_res = { pre_id_res = {
'std_result': std_result, 'std_result': std_result,
'ab_result': ab_result, 'ab_result': ab_result,
} }
return id_res return pre_id_res
def predict(self, dic_data): def model_predict(self, dic_data):
res = self.pre_predict(dic_data) """
if res: 该方法目前只对[参数, 型号, 数量, 品牌]进行预测
return res :param dic_data:
if len(dic_data) > 0: :return:
self.order_predict(dic_data[0]) """
pass print(dic_data)
prob_columns = []
def order_predict(self, data): temp_pre_model_res = {}
collect_num = [int(kw) for kw in data if isinstance(kw, float) or isinstance(kw, int)]
judge = self.IsIncrease(collect_num, len(collect_num)) for k, v in dic_data.items():
print('judge: ' + str(judge)) bol = self.v_chain(v)
return judge if bol:
print(k, bol)
""" prob_columns.append(k)
判断列表元素是否递增 continue
""" if self.isnum(v):
temp_pre_model_res[k] = '数量'
def IsIncrease(self, arr, size): continue
if size == 1: if self.isseq(v):
temp_pre_model_res[k] = '序号'
temp_dic_data = {k: list(filter(lambda x: x != PLACEHOLDER, dic_data[k]))for k in prob_columns}
for k, v in temp_dic_data.items():
li_single_pred_res = []
for string in v:
single_pred_res, probdic = self.get_single_predict(string)
li_single_pred_res.append(single_pred_res)
result = Counter(li_single_pred_res)
# [('brand_name', 4), ('goods_name', 3), ('param', 2)]
li_sort = sorted(result.items(), key=lambda x: x[1], reverse=True)
print(k, li_sort)
temp_pre_model_res[k] = en_to_zh_map[li_sort[0][0]]
print(temp_pre_model_res)
pre_model_res = {}
def predict(self, dic_data, predict_type='all'):
    """Clean the incoming columns and run the requested prediction strategy.

    :param dic_data: mapping of column key -> list of cell values
    :param predict_type: 'all' or 'pre' run pre_predict; 'model' runs
        model_predict; any other value yields None
    :return: the strategy's result when it is truthy, otherwise None
    """
    cleaned = self.pre_deal(dic_data)
    if predict_type in ('all', 'pre'):
        outcome = self.pre_predict(cleaned)
    elif predict_type == 'model':
        outcome = self.model_predict(cleaned)
    else:
        return None
    return outcome if outcome else None
def pre_deal(self, dic_data):
    """Drop columns that consist mostly of placeholder cells.

    A cell is empty when its stripped string form equals PLACEHOLDER.
    Columns whose empty rate is greater than or equal to NAN_RATE are
    removed, as are columns with no cells at all.

    :param dic_data: mapping of column key -> list of cell values
    :return: new dict holding only the columns that were kept
    """
    new_dic_data = {}
    for k, v in dic_data.items():
        if len(v) == 0:
            # An empty column carries no data and would divide by zero below.
            continue
        empty_cells = sum(1 for item in v if str(item).strip() == PLACEHOLDER)
        # Strict comparison: a column sitting exactly on the threshold is dropped.
        if empty_cells / len(v) < NAN_RATE:
            new_dic_data[k] = v
    return new_dic_data
def v_chain(self, li):
    """Run every ``valid_*`` method of this object against ``li``.

    Validators are discovered through ``dir(self)`` and called in that
    order; evaluation stops at the first one that returns a falsy value.

    :param li: list of cell values for one column
    :return: False as soon as one validator fails, True otherwise
    """
    validators = (
        getattr(self, attr_name)
        for attr_name in dir(self)
        if attr_name.startswith('valid_')
    )
    return all(check(li) for check in validators)
return (arr[size - 1] >= arr[size - 2]) and self.IsIncrease(arr, size - 1)
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding:utf-8 -*- # -*- coding:utf-8 -*-
import pandas as pd l=[' ', ' ', ' ', '不需要报价', ' ', ' ', ' ', '不需要报价', ' ', ' ', '不需要报价', ' ', ' ', ' ', '不需要报价', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
for v in l:
# 去掉空置率大于等于0.8的列
# def read_from_excel(file_name, sheet_name): counter = 0
# wb = xlrd.open_workbook(file_name) for item in v:
# sheet = wb.sheet_by_name(sheet_name) if str(item).strip() == PLACEHOLDER:
# row = sheet.nrows counter += 1
# col = sheet.ncols if counter / len(v) <= NAN_RATE:
# result_dict = {} new_dic_data[k] = v
# for i in range(col): \ No newline at end of file
# col_list = []
# for j in range(row): col_list.append(sheet.cell_value(j, i))
# result_dict[i] = col_list
# return result_dict
def read_from_excel(file_name, sheet_name):
    """Load one worksheet and return its columns keyed by position.

    The sheet is read without a header row, so the first row is kept as
    data, and blank cells stay as empty strings rather than NaN.

    :param file_name: path of the Excel workbook
    :param sheet_name: name of the worksheet to read
    :return: dict mapping 0-based column position -> list of cell values
    """
    # header=None is the supported way to say "no header row"; negative
    # header values are rejected by current pandas releases.
    data = pd.read_excel(file_name, sheet_name=sheet_name, keep_default_na=False, header=None)
    return {position: data[column].tolist() for position, column in enumerate(data.columns)}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment