Commit f055b375 by lichenggang

1. 增加other预测

2. 多列被预测为参数或型号则选不同率最高的列为目标列,其余的丢弃
3. 验证链取消类别验证
parent 4e418107
No preview for this file type
No preview for this file type
......@@ -57,8 +57,8 @@ class BasePredictor:
def valid_num(self, data):
    """
    Return True when `data` is NOT classified as a numeric column.

    :param data: column values, forwarded unchanged to `self.is_num`
    :return: bool, the logical negation of `self.is_num(data)`
    """
    looks_numeric = self.is_num(data)
    if looks_numeric:
        return False
    return True
def valid_cate(self, data):
    """
    Return True when `data` is NOT classified as a category column.

    :param data: column values, forwarded unchanged to `self.is_catecol`
    :return: bool, the logical negation of `self.is_catecol(data)`
    """
    looks_like_category = self.is_catecol(data)
    if looks_like_category:
        return False
    return True
# def valid_cate(self, data):
# return not self.is_catecol(data)
@classmethod
def is_num(self, data):
......@@ -74,6 +74,7 @@ class BasePredictor:
return re.match(r'(\d+|\d+(\.\d+))($|(K)|([\u4E00-\u9FA5]{1,3}))$', kw, re.M | re.I)
else:
return False
collect_num = [kw for kw in data if isinstance(kw, int) or isinstance(kw, float) or isNumberCol(str(kw))]
rate = round(len(collect_num) / len(data), 3)
return True if rate >= RIGHT_LEVEL else False
......@@ -85,6 +86,7 @@ class BasePredictor:
for j in CATEGORY:
if j in str(i):
cates.append(i)
break
rate = round(len(cates) / len(data), 3)
return rate >= CATE_LEVEL
......@@ -116,14 +118,14 @@ class BasePredictor:
rate = round(len(collect_seq) / len(data), 3)
return True if rate >= SEQ_LEVEL else False
@classmethod
def get_diffrate(self, data):
    """
    Return the distinct-value ratio of a column.

    The ratio is the number of unique values divided by the total number
    of values, rounded to 3 decimal places.

    :param data: sized collection of hashable values
    :return: the rounded ratio; 0.0 when `data` is empty
    """
    total = len(data)
    if total == 0:
        # An empty column has nothing to compare; return 0.0 instead of
        # raising ZeroDivisionError.
        return 0.0
    return round(len(set(data)) / total, 3)
if __name__ == "__main__":
li = ['?', 3400.0, 5920.0, 4849.0, 2544.0, 3270.0, 52751.0, 2031.0, 5302.0, 726.0, 1247.0, 2472.0, 689.0, 6049.0,
26796.0, 6164.0, 1605.0, 4346.0, 640.0, 960.0, 960.0, 320.0, 160.0, 860.0, 160.0, 320.0, 3183.0, 10151.0,
640.0, 130.0, 1237.0, 800.0, 960.0, 3740.0, 17701.0, 2146.0, 1280.0, 160.0, 1120.0, 160.0, 480.0, 960.0,
480.0, 160.0, 4717.0, 160.0, 160.0, 160.0, 640.0, 160.0, 320.0, 160.0, 160.0, 800.0, 800.0, 480.0, 1600.0,
155.0, 960.0, 320.0, 944.0, 160.0, 160.0, 1280.0, 1852.0, 7680.0, 7680.0, 2880.0, 160.0, 224.0, 480.0, 480.0,
640.0, 160.0, 640.0, 320.0, 1760.0, 640.0, 480.0, 960.0, 160.0, 160.0, 160.0, 160.0, 1920.0, 160.0, 5600.0,
480.0, 2560.0, 160.0, 160.0, 160.0, 160.0, 160.0, 1280.0, 160.0, 160.0, 160.0, 160.0, 160.0, 320.0, 0.0,
160.0, 160.0]
print(BasePredictor.is_num(li))
li = ['型号', '电阻', '电阻', '电阻', '电阻', '电阻', '电阻', '电阻', '电阻', '电阻', '电阻', '电阻', '电容', '电容', '电容', '电容', '电容', '电容', '电容', '电容', '电容', '电感', '电感', '电感', '电感', '电感', '二极管', '二极管', '二极管', '二极管', '二极管', '二极管', '二极管', '场效应管', '场效应管', '场效应管', '场效应管', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '集成电路', '振荡器', '振荡器', '光电器件', '光电器件', '光电器件', '磁珠', '保险丝', '保险丝', '保险丝', '开关元件', '继电器', '继电器', '继电器', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '接插件', '模块', '模块', '模块']
print(set(li))
......@@ -91,7 +91,7 @@ class DicPredict(BasePredictor):
def model_predict(self, dic_data):
"""
该方法目前只对[序号(非标准), 数量, 类别, 参数, 型号, 品牌]进行预测, 前三者是非模型预测
该方法目前只对[序号(非标准), 数量, 类别, 参数, 型号, 品牌, other]进行预测, 前三者是非模型预测
:param dic_data:
:return :只有[参数, 数量]会强制有结果
"""
......@@ -107,12 +107,13 @@ class DicPredict(BasePredictor):
if self.is_num(no_null_v):
temp_pre_model_res[k] = '数量'
continue
if bol:
prob_columns.append(k)
continue
# continue
if self.is_catecol(no_null_v):
temp_pre_model_res[k] = '类别'
continue
not_null_dic_data = {k: list(filter(lambda x: x != self.PLACEHOLDER, dic_data[k])) for k in prob_columns}
for k, v in not_null_dic_data.items():
li_single_pred_res = []
......@@ -120,14 +121,20 @@ class DicPredict(BasePredictor):
single_pred_res, probdic = self.get_single_predict(string)
li_single_pred_res.append(single_pred_res)
result = repeat_max(li_single_pred_res)
# 如果该列被预测为其他, 则不做改动
if result == 'other':
continue
temp_pre_model_res[k] = EN_TO_ZH_MAP[result]
# 参数和型号列出现多条相同值则丢弃
prob_param_and_gn_cols = [i for i in temp_pre_model_res if
temp_pre_model_res[i] == '参数' or temp_pre_model_res[i] == '型号']
for col in prob_param_and_gn_cols:
if self.is_multi_same(not_null_dic_data[col]):
temp_pre_model_res.pop(col)
# 若有多个参数列或型号列,则进行不同率的比较, 不同率最高的选为目标列
prob_param_cols = [i for i in temp_pre_model_res if temp_pre_model_res[i] == '参数']
prob_gn_cols = [i for i in temp_pre_model_res if temp_pre_model_res[i] == '型号']
for param_or_gn_col_list in [prob_param_cols, prob_gn_cols]:
if len(param_or_gn_col_list) >= 2:
li_diffrate = [(col, BasePredictor.get_diffrate(not_null_dic_data[col])) for col in param_or_gn_col_list]
sort_li_diffrate = sorted(li_diffrate, key=lambda x: x[1], reverse=True)
for col_diffrate in sort_li_diffrate[1:]:
temp_pre_model_res.pop(col_diffrate[0])
model_id_res = {
'std_result': temp_pre_model_res,
......@@ -192,6 +199,8 @@ class DicPredict(BasePredictor):
return True
def get_comprehensive_res(self, pre_std_result, model_std_result):
print('表头预测结果', pre_std_result)
print('模型预测结果', model_std_result)
vote_count = {
"类别": [],
"参数": [],
......
......@@ -17,7 +17,7 @@ STD_FIELDS_MAP = {
MUST_STD_FIELDS = ['参数', '数量']
# 参数名和中文的映射
EN_TO_ZH_MAP = {'brand_name': '品牌', 'param': '参数', 'goods_name': '型号'}
EN_TO_ZH_MAP = {'brand_name': '品牌', 'param': '参数', 'goods_name': '型号', 'other': '其他'}
# 类别合集 从learning_data.lie_category导入
CATEGORY = ["半导体", "嵌入式", "光电子", "光源", "无源", "连接器", "断路器", "指示灯", "声源", "接触器", "铁氧芯", "冷热系统", "电源", "电线", "机械",
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment