Commit 4bd8fa67 by lzzzzl

数量新增识别小数列

parents 17dc32e3 b2d26416
...@@ -4,8 +4,7 @@ from protobuf import classify_pb2_grpc ...@@ -4,8 +4,7 @@ from protobuf import classify_pb2_grpc
import pandas as pd import pandas as pd
import json import json
def get_test_data(): def get_test_data():
df=pd.read_excel(r'C:\Users\ICHUNT\Desktop\bomlist\00 BSJ BMS合并-询价-珠海能源.xlsx', header=None) df=pd.read_excel(r'C:\Users\ICHUNT\Desktop\bomlist\DZ0901_V1.4_BOM.xlsx',header=None)
print(df)
df.fillna(' ', inplace=True) df.fillna(' ', inplace=True)
dic_dft = df.to_dict(orient='list') dic_dft = df.to_dict(orient='list')
return json.dumps(dic_dft) return json.dumps(dic_dft)
......
No preview for this file type
No preview for this file type
...@@ -29,7 +29,7 @@ if __name__ == "__main__": ...@@ -29,7 +29,7 @@ if __name__ == "__main__":
def get_test_data(): def get_test_data():
import pandas as pd import pandas as pd
import json import json
df = pd.read_excel(r'C:\data\lx\51AB0571_ CCTV ASST询价_SZIMS.xlsx', header=None, sheet_name='1') df = pd.read_excel(r'C:\Users\ICHUNT\Desktop\bomlist\51AB0571_ CCTV ASST询价_SZIMS.xlsx', header=None, sheet_name='3')
df.fillna('?', inplace=True) df.fillna('?', inplace=True)
dic_dft = df.to_dict(orient='list') dic_dft = df.to_dict(orient='list')
return json.dumps(dic_dft) return json.dumps(dic_dft)
...@@ -40,7 +40,6 @@ if __name__ == "__main__": ...@@ -40,7 +40,6 @@ if __name__ == "__main__":
data = get_test_data() data = get_test_data()
p = PredictorFac(model_config) p = PredictorFac(model_config)
data = json.loads(data) data = json.loads(data)
res = p.predict(data,predict_type='model') print(data)
res = p.predict(data,predict_type='all')
print(res) print(res)
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding:utf-8 -*- # -*- coding:utf-8 -*-
import re import re
from collections import Counter
from utils.log_manager import get_logger from utils.log_manager import get_logger
from utils.robots import dd_send_msg from utils.robots import dd_send_msg
import pandas as pd import pandas as pd
NUMBER_LEVEL = 0.7 CATEGORY = ['二极管']
RIGHT_LEVEL = 0.7
SEQ_LEVEL = 0.5 SEQ_LEVEL = 0.5
CATE_LEVEL = 0.5
MULTI_SAME_LEVEL = 3
class BasePredictor: class BasePredictor:
''' '''
预测类基类 预测类基类
''' '''
# 占位符
PLACEHOLDER = '?'
def __init__(self, name, extractor, classifier): def __init__(self, name, extractor, classifier):
self.name = name self.name = name
self.classifier = classifier self.classifier = classifier
...@@ -22,16 +31,16 @@ class BasePredictor: ...@@ -22,16 +31,16 @@ class BasePredictor:
self.robot_msg = dd_send_msg self.robot_msg = dd_send_msg
self.pd = pd self.pd = pd
def predict(self, key): def predict(self, key):
raise NotImplementedError raise NotImplementedError
def get_single_predict(self, string: object) -> tuple: def get_single_predict(self, obj: object) -> tuple:
''' '''
:param string: 接收单个要判断的字符串 :param string: 接收单个要判断的字符串
:return tuple: 返回两个元素的元组, 第一个元素为判断结果, 第二个元素为可能性的字典 :return tuple: 返回两个元素的元组, 第一个元素为判断结果, 第二个元素为可能性的字典
''' '''
string = str(obj)
series = self.pd.Series([string]) series = self.pd.Series([string])
feature = self.extractor.transform(series) feature = self.extractor.transform(series)
predictions = self.classifier.predict(feature) predictions = self.classifier.predict(feature)
...@@ -41,46 +50,80 @@ class BasePredictor: ...@@ -41,46 +50,80 @@ class BasePredictor:
dic_proba = {k: v for k, v in zip(classes, deal_list)} dic_proba = {k: v for k, v in zip(classes, deal_list)}
return predictions[0], dic_proba return predictions[0], dic_proba
def isseq(self, data): def valid_seq(self, data):
""" """取反"""
序号列预测 return not self.is_seq(data)
"""
collect_seq = [int(kw) for kw in data if isinstance(kw, float) or isinstance(kw, int)] def valid_num(self, data):
if len(collect_seq) < 1 or not self.isIncrease(collect_seq, len(collect_seq)): return not self.is_num(data)
return False
else:
rate = round(len(collect_seq) / len(data), 3)
return True if rate >= SEQ_LEVEL else False
def isnum(self, data): def valid_cate(self, data):
return not self.is_catecol(data)
@classmethod
def is_num(self, data):
""" """
数量列预测 数量列预测
""" """
collect_num = [kw for kw in data if isinstance(kw, int) or self.isNumberCol(kw)]
def isNumberCol(kw):
"""
是否是数量列辅助函数
"""
if isinstance(kw, str):
return re.match(r'(\d+|\d+(\.\d+))($|(K)|([\u4E00-\u9FA5]{1,3}))$', kw, re.M | re.I)
else:
return False
collect_num = [kw for kw in data if isinstance(kw, int) or isinstance(kw, float) or isNumberCol(str(kw))]
rate = round(len(collect_num) / len(data), 3) rate = round(len(collect_num) / len(data), 3)
return True if rate >= NUMBER_LEVEL else False return True if rate >= RIGHT_LEVEL else False
def isIncrease(self, arr, size): @classmethod
""" def is_catecol(self, data):
判断列表元素是否递增 cates = []
""" for i in data:
if size == 1: for j in CATEGORY:
return True if j in str(i):
return (arr[size - 1] >= arr[size - 2]) and self.isIncrease(arr, size - 1) cates.append(i)
rate = round(len(cates) / len(data), 3)
return rate >= CATE_LEVEL
@classmethod
def is_multi_same(cls, data):
    """
    Report whether some non-placeholder value repeats often enough in a column.

    :param data: iterable of hashable cell values belonging to one column
    :return bool: True when the most frequent value other than
        ``PLACEHOLDER`` occurs at least ``MULTI_SAME_LEVEL`` times; False
        otherwise, including when nothing but placeholders is present
    """
    counts = Counter(item for item in data if item != cls.PLACEHOLDER)
    # An empty or all-placeholder column has no repeated value. The previous
    # ``li_sort[0]`` lookup raised IndexError in that situation.
    if not counts:
        return False
    return max(counts.values()) >= MULTI_SAME_LEVEL
def isNumberCol(self, kw): @classmethod
def is_seq(self, data):
""" """
是否是数量列 序号列预测
""" """
if isinstance(kw, str):
return re.match(r'(\d+)((K)|([\u4E00-\u9FA5]{1,3}))$', kw, re.M | re.I) def isIncrease(arr, size):
else: """
判断列表元素是否递增
"""
if size == 1:
return True
return (arr[size - 1] >= arr[size - 2]) and isIncrease(arr, size - 1)
collect_seq = [int(kw) for kw in data if isinstance(kw, float) or isinstance(kw, int)]
if len(collect_seq) < 1 or not isIncrease(collect_seq, len(collect_seq)):
return False return False
else:
rate = round(len(collect_seq) / len(data), 3)
return True if rate >= SEQ_LEVEL else False
def valid_seq(self, data):
"""取反"""
return not self.isseq(data)
def valid_num(self, data): if __name__ == "__main__":
return not self.isnum(data) li = ['?', 3400.0, 5920.0, 4849.0, 2544.0, 3270.0, 52751.0, 2031.0, 5302.0, 726.0, 1247.0, 2472.0, 689.0, 6049.0,
\ No newline at end of file 26796.0, 6164.0, 1605.0, 4346.0, 640.0, 960.0, 960.0, 320.0, 160.0, 860.0, 160.0, 320.0, 3183.0, 10151.0,
640.0, 130.0, 1237.0, 800.0, 960.0, 3740.0, 17701.0, 2146.0, 1280.0, 160.0, 1120.0, 160.0, 480.0, 960.0,
480.0, 160.0, 4717.0, 160.0, 160.0, 160.0, 640.0, 160.0, 320.0, 160.0, 160.0, 800.0, 800.0, 480.0, 1600.0,
155.0, 960.0, 320.0, 944.0, 160.0, 160.0, 1280.0, 1852.0, 7680.0, 7680.0, 2880.0, 160.0, 224.0, 480.0, 480.0,
640.0, 160.0, 640.0, 320.0, 1760.0, 640.0, 480.0, 960.0, 160.0, 160.0, 160.0, 160.0, 1920.0, 160.0, 5600.0,
480.0, 2560.0, 160.0, 160.0, 160.0, 160.0, 160.0, 1280.0, 160.0, 160.0, 160.0, 160.0, 160.0, 320.0, 0.0,
160.0, 160.0]
print(BasePredictor.is_num(li))
...@@ -5,38 +5,42 @@ from collections import Counter ...@@ -5,38 +5,42 @@ from collections import Counter
from predict.base_handler import BasePredictor from predict.base_handler import BasePredictor
# 可能的头部字段 # 可能的头部字段
prob_fields = ["序号", "名称", "规格", "MPN", "用量(pcs)", "用量", "pcs", "位号", "描述", "值", "数量", "封装", "类别", "a面位置", "b面位置", "备注", PROB_FIELDS = ["序号", "名称", "规格", "MPN", "用量(pcs)", "用量", "pcs", "位号", "描述", "值", "数量", "封装", "类别", "a面位置", "b面位置", "备注",
"需求数量", "售价", "需求数量", "售价", "封装", "封装规格",
"参考品牌", "品牌", "item", "厂商编码", "品牌/厂商", "参考料号", "参考供应商", "top面", "bottom面"] "参考品牌", "品牌", "item", "厂商编码", "品牌/厂商", "参考料号", "参考供应商", "top面", "bottom面"]
# 标准名和代名词的映射 # 标准名和代名词的映射
fields_map = {"序号": ["序号"], STD_FIELDS_MAP = {
"类别": ["类别", "分类", "名称", "类别名称"], "类别": ["类别", "分类", "名称", "类别名称"],
"参数": ["参数", "规格", "描述"], "参数": ["参数", "规格", "描述", "值"],
"型号": ["型号", "参考料号", "料号", "MPN", "厂商编码"], "型号": ["型号", "参考料号", "料号", "MPN", "厂商编码"],
"数量": ["数量", "用量(pcs)", "PCS", "用量", "用量(PCS)", "pcs"], "数量": ["数量", "用量(pcs)", "PCS", "用量", "用量(PCS)", "pcs"],
"封装": ["封装", "封装规格"],
"品牌": ["品牌", "品牌/厂商", "参考品牌", "参考供应商", "厂商", "参考供应商", "参考厂商"]} "品牌": ["品牌", "品牌/厂商", "参考品牌", "参考供应商", "厂商", "参考供应商", "参考厂商"]}
# 必须返回也必须验证的标准字段 # 必须返回也必须验证的标准字段
MUST_STD_FIELDS = ['参数', '数量'] MUST_STD_FIELDS = ['参数', '数量']
# #
order_list = ['序号'] order_list = ['序号']
en_to_zh_map = {'brand_name': '品牌', 'param': '参数', 'goods_name': '型号'} en_to_zh_map = {'brand_name': '品牌', 'param': '参数', 'goods_name': '型号'}
def fun(seri): def fun(seri):
li_seri = seri.tolist() li_seri = seri.tolist()
for field in li_seri: for field in li_seri:
if str(field).lower() in prob_fields: if str(field).lower() in PROB_FIELDS:
return field, seri.name return field, seri.name
def repeat_max(li):
    """
    Return the element that occurs most often in ``li``.

    When several elements share the highest count, the one encountered first
    wins. An empty ``li`` raises IndexError.
    """
    # most_common(1) yields e.g. [('brand_name', 4)] when the counts are
    # brand_name=4, goods_name=3, param=2
    top = Counter(li).most_common(1)
    return top[0][0]
# 取前多少行 # 取前多少行
HEAD_ROW = 7 HEAD_ROW = 7
# 空置率阈值 # 空置率阈值
NAN_RATE = 0.8 NAN_RATE = 0.8
# 占位符
PLACEHOLDER = '?'
class DicPredict(BasePredictor): class DicPredict(BasePredictor):
...@@ -56,14 +60,14 @@ class DicPredict(BasePredictor): ...@@ -56,14 +60,14 @@ class DicPredict(BasePredictor):
'pronoun': field, 'pronoun': field,
'column_name': column_name 'column_name': column_name
} }
for k, v in fields_map.items(): for k, v in STD_FIELDS_MAP.items():
if field.lower() in fields_map[k]: if field.lower() in STD_FIELDS_MAP[k]:
dic['std_name'] = k dic['std_name'] = k
li_res.append(dic) li_res.append(dic)
return li_res return li_res
def pre_predict(self, dict_data): def head_predict(self, dict_data):
columns = [] columns = []
li_data = [] li_data = []
...@@ -75,15 +79,13 @@ class DicPredict(BasePredictor): ...@@ -75,15 +79,13 @@ class DicPredict(BasePredictor):
dft = df.T.head(HEAD_ROW) dft = df.T.head(HEAD_ROW)
dft.columns = columns dft.columns = columns
li_res_raw = self.id_by_field(dft) li_res_raw = self.id_by_field(dft)
std_result = [] std_result = {}
ab_result = [] ab_result = {}
for i in li_res_raw: for i in li_res_raw:
if i.get('std_name'): if i.get('std_name'):
dic_has_res = {i['column_name']: i['std_name']} std_result[i['column_name']] = i['std_name']
std_result.append(dic_has_res)
else: else:
dic_ab_res = {i['column_name']: i['pronoun']} ab_result[i['column_name']] = i['pronoun']
ab_result.append(dic_ab_res)
pre_id_res = { pre_id_res = {
'std_result': std_result, 'std_result': std_result,
'ab_result': ab_result, 'ab_result': ab_result,
...@@ -92,54 +94,73 @@ class DicPredict(BasePredictor): ...@@ -92,54 +94,73 @@ class DicPredict(BasePredictor):
def model_predict(self, dic_data): def model_predict(self, dic_data):
""" """
该方法目前只对[参数, 型号, 数量, 品牌]进行预测 该方法目前只对[序号(非标准), 数量, 类别, 参数, 型号, 品牌]进行预测, 前三者是非模型预测
:param dic_data: :param dic_data:
:return: :return :只有[参数, 数量]会强制有结果
""" """
print(dic_data)
prob_columns = [] prob_columns = []
temp_pre_model_res = {} temp_pre_model_res = {}
ab_result = {}
for k, v in dic_data.items(): for k, v in dic_data.items():
bol = self.v_chain(v) bol = self.v_chain(v)
if bol: if bol:
print(k, bol)
prob_columns.append(k) prob_columns.append(k)
continue continue
if self.isnum(v): if self.is_seq(v):
ab_result[k] = '序号'
continue
if self.is_num(v):
temp_pre_model_res[k] = '数量' temp_pre_model_res[k] = '数量'
continue continue
if self.isseq(v): if self.is_catecol(v):
temp_pre_model_res[k] = '序号' temp_pre_model_res[k] = '类别'
temp_dic_data = {k: list(filter(lambda x: x != PLACEHOLDER, dic_data[k]))for k in prob_columns} continue
temp_dic_data = {k: list(filter(lambda x: x != self.PLACEHOLDER, dic_data[k])) for k in prob_columns}
for k, v in temp_dic_data.items(): for k, v in temp_dic_data.items():
li_single_pred_res = [] li_single_pred_res = []
for string in v: for string in v:
single_pred_res, probdic = self.get_single_predict(string) single_pred_res, probdic = self.get_single_predict(string)
li_single_pred_res.append(single_pred_res) li_single_pred_res.append(single_pred_res)
result = Counter(li_single_pred_res) result = repeat_max(li_single_pred_res)
# [('brand_name', 4), ('goods_name', 3), ('param', 2)] temp_pre_model_res[k] = en_to_zh_map[result]
li_sort = sorted(result.items(), key=lambda x: x[1], reverse=True)
print(k, li_sort) # 参数和型号列出现多条相同值则丢弃
temp_pre_model_res[k] = en_to_zh_map[li_sort[0][0]] prob_param_and_gn_cols = [i for i in temp_pre_model_res if temp_pre_model_res[i] == '参数' or temp_pre_model_res[i] == '型号']
print(temp_pre_model_res) for col in prob_param_and_gn_cols:
if self.is_multi_same(temp_dic_data[col]):
pre_model_res = {} temp_pre_model_res.pop(col)
model_id_res = {
'std_result': temp_pre_model_res,
'ab_result': ab_result,
}
return model_id_res
def predict(self, dic_data, predict_type='all'): def predict(self, dic_data, predict_type='all'):
dic_data = self.pre_deal(dic_data) dic_data = self.pre_deal(dic_data)
if predict_type == 'all': if predict_type == 'all':
pre_id_res = self.pre_predict(dic_data) pre_id_res = self.head_predict(dic_data)
if pre_id_res: model_id_res = self.model_predict(dic_data)
return pre_id_res
# 表头预测和模型预测最后返回的数据进行综合处理
pre_std_result = pre_id_res.get('std_result')
model_std_result = model_id_res.get('std_result')
comprehensive_res = self.get_comprehensive_res(pre_std_result, model_std_result)
if comprehensive_res:
res = {
'std_result': comprehensive_res,
'ab_result': pre_id_res['ab_result'],
}
return res
elif predict_type == 'model': elif predict_type == 'model':
model_id_res = self.model_predict(dic_data) model_id_res = self.model_predict(dic_data)
if model_id_res: if model_id_res:
return model_id_res return model_id_res
elif predict_type == 'pre':
pre_id_res = self.pre_predict(dic_data) elif predict_type == 'head':
pre_id_res = self.head_predict(dic_data)
if pre_id_res: if pre_id_res:
return pre_id_res return pre_id_res
...@@ -149,7 +170,7 @@ class DicPredict(BasePredictor): ...@@ -149,7 +170,7 @@ class DicPredict(BasePredictor):
# 去掉空置率大于等于0.8的列 # 去掉空置率大于等于0.8的列
counter = 0 counter = 0
for item in v: for item in v:
if str(item).strip() == PLACEHOLDER: if str(item).strip() == self.PLACEHOLDER:
counter += 1 counter += 1
if counter / len(v) <= NAN_RATE: if counter / len(v) <= NAN_RATE:
new_dic_data[k] = v new_dic_data[k] = v
...@@ -170,3 +191,23 @@ class DicPredict(BasePredictor): ...@@ -170,3 +191,23 @@ class DicPredict(BasePredictor):
else: else:
return True return True
def get_comprehensive_res(self, pre_std_result, model_std_result):
    """
    Merge header-based and model-based column predictions by voting.

    :param pre_std_result: dict of column name -> standard field name produced
        by header prediction; None or empty is treated as "no votes"
    :param model_std_result: dict of column name -> standard field name
        produced by model prediction; None or empty is treated as "no votes"
    :return dict: column name -> standard field name, holding for every
        standard field that received a vote the column voted for most often
    """
    # Pre-seed the usual fields so results are assembled in a fixed order.
    vote_count = {
        "类别": [],
        "参数": [],
        "型号": [],
        "数量": [],
        "品牌": []
    }
    # Header votes are recorded before model votes, so on a tie the header's
    # column is the one seen first.
    for std_result in (pre_std_result, model_std_result):
        for col, std_name in (std_result or {}).items():
            # setdefault instead of plain indexing: header prediction can
            # produce a standard name that is not pre-seeded above (for
            # example "封装" from STD_FIELDS_MAP), which used to raise KeyError.
            vote_count.setdefault(std_name, []).append(col)

    comprehensive_res = {}
    for std_name, col_li in vote_count.items():
        if col_li:
            # Most voted column; equal counts resolve to the first one seen.
            col = Counter(col_li).most_common(1)[0][0]
            # If one column wins for several standard names, the name
            # processed last overwrites the earlier ones.
            comprehensive_res[col] = std_name
    return comprehensive_res
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment