Commit 94698314 by lichenggang

增加封装的识别

parent bb699679
......@@ -6,14 +6,13 @@ from utils.log_manager import bom_log
from protobuf import classify_pb2
from protobuf import classify_pb2_grpc
import traceback
from utils.config import model_config
from predic_fac import PredictorFac
class Classify(classify_pb2_grpc.classifyServicer):
def __init__(self, *args, **kwargs):
    """Initialise the gRPC servicer and attach the predictor factory.

    All positional and keyword arguments are forwarded unchanged to the
    parent servicer.
    """
    super().__init__(*args, **kwargs)
    # PredictorFac no longer takes a config argument; the earlier
    # ``PredictorFac(model_config)`` call was superseded by this one.
    self.predictorfac = PredictorFac()
def Classify(self, request, context):
bom_log.info(f'grpc收到数据: {request.keyword}')
......
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import sys

# macOS and Windows hosts are treated as the 'test' environment; every other
# platform is treated as 'produce'.
ENVIRONMENT = 'test' if sys.platform in ('darwin', 'win32') else 'produce'

# Directory that contains this configuration module.
project_path = os.path.abspath(os.path.dirname(__file__))

# Both environments currently use the same settings, so the dictionary is
# built once instead of in two identical if/else branches.
model_config = {
    'model_path': r'model',
    'extractor_path': r'extractor',
    # Joining with a trailing '' keeps the trailing separator while using the
    # separator of the running platform. Hard-coded backslashes only form a
    # valid directory path on Windows.
    'modextr_path': os.path.join(project_path, 'models_and_extractors', ''),
}
......@@ -8,21 +8,21 @@ import tornado.ioloop
import traceback
from utils.log_manager import bom_log
from utils.config import model_config
from predic_fac import PredictorFac
class BaseHandler(tornado.web.RequestHandler):
    """Base request handler that exposes a ``PredictorFac`` as ``self.predictorfac``."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``RequestHandler`` and attach the factory."""
        super().__init__(*args, **kwargs)
        # PredictorFac no longer takes a config argument; the earlier
        # ``PredictorFac(model_config)`` call was superseded by this one.
        # NOTE(review): Tornado creates a handler object per request, so this
        # constructor presumably runs on every request -- confirm that
        # building a PredictorFac is cheap, or share a single instance.
        self.predictorfac = PredictorFac()
class KwHandler(BaseHandler):
async def get(self):
try:
target = unquote(self.get_argument('keyword'))
bom_log.info(f'http收到识别关键词: {target}')
res = self.predictorfac.predict(target)
predict_type = unquote(self.get_argument('type', 'all'))
bom_log.info(f'http收到识别关键词: {target}, 预测类型为{predict_type}')
res = self.predictorfac.predict(target, predict_type)
res['status'] = 1
except:
res = {
......
No preview for this file type
No preview for this file type
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from sklearn.externals import joblib
import pickle, json
import json
from predict import dict_predict, kw_predict
class PredictorFac():
    """Factory that builds a keyword predictor and a dict predictor from config.

    NOTE(review): a second ``class PredictorFac`` follows this one in the
    file, so this definition is shadowed once the module has been imported.
    """

    def __init__(self, config):
        """Load the extractor and classifier named in ``config``.

        :param config: mapping with the keys 'extractor_path' (pickled
            extractor file) and 'model_path' (joblib classifier file).
        """
        extractor_path = config['extractor_path']
        # NOTE(review): pickle.load executes code embedded in the file; only
        # load artefacts from a trusted location.
        with open(extractor_path, 'rb') as extractor_file:
            extractor = pickle.load(extractor_file)
        classifier = joblib.load(config['model_path'])
        # Both predictors share the same extractor / classifier pair.
        self.kw_predictor = kw_predict.KwPredict('single', extractor, classifier)
        self.dict_predictor = dict_predict.DicPredict('dict', extractor, classifier)
class PredictorFac:
def __init__(self):
    """Create the two predictors this factory dispatches to.

    ``kw_predictor`` is built in 'single' mode and ``dict_predictor`` in
    'dict' mode; neither receives an extractor or classifier here.
    """
    self.kw_predictor = kw_predict.KwPredict('single')
    self.dict_predictor = dict_predict.DicPredict('dict')
def _get_predictor(self, data):
......@@ -30,16 +27,13 @@ if __name__ == "__main__":
def get_test_data(path=r'C:\Users\ICHUNT\Desktop\bomlist\sky-新建 XLSX 工作表 (2).xlsx'):
    """Read a header-less Excel sheet and return its columns as a JSON string.

    The workbook is read exactly once. Previously an older sample file was
    read first and its result immediately discarded.

    :param path: workbook to load; defaults to the developer's sample file.
    :return: JSON text mapping each column label to the list of its cell
        values, with empty cells replaced by '?'.
    """
    import json

    import pandas as pd

    df = pd.read_excel(path, header=None)
    # '?' is the placeholder used for empty cells.
    df = df.fillna('?')
    return json.dumps(df.to_dict(orient='list'))
from utils.config import model_config
data = get_test_data()
p = PredictorFac(model_config)
p = PredictorFac()
data = json.loads(data)
pretty_col = {'第%s列' % k: v for k, v in data.items()}
print(pretty_col)
......
......@@ -7,7 +7,7 @@ from static_config import *
def fun(seri):
    """Find the first header-like cell in a column.

    :param seri: column object providing ``tolist()`` and ``name``
        (used like a pandas Series).
    :return: ``(field, seri.name)`` for the first cell whose lower-cased text
        is listed in ``ALL_FIELDS``; ``None`` when no cell matches.
    """
    for field in seri.tolist():
        # Only ALL_FIELDS is consulted: it also contains the standard-field
        # aliases, which PROB_FIELDS no longer holds.
        if str(field).lower() in ALL_FIELDS:
            return field, seri.name
    return None
......@@ -16,15 +16,16 @@ def get_head_row(li: list) -> int:
返回第一行有效数据所在的行
"""
for index, i in enumerate(li):
if str(i).lower() in PROB_FIELDS:
if str(i).lower() in ALL_FIELDS:
return index + 1
# Number of leading rows taken for header prediction
HEAD_ROW = 7
# Empty-cell rate threshold
NAN_RATE = 0.8
# Empty-cell rate threshold for parameter columns
PARAM_NAN_RATE = 0.3
class DicPredict(BasePredictor):
......@@ -38,7 +39,6 @@ class DicPredict(BasePredictor):
li_fie = series.tolist()
li_res = []
for field, column_name in li_fie:
print(field)
dic = {
'std_name': '',
'pronoun': field,
......@@ -102,9 +102,9 @@ class DicPredict(BasePredictor):
def model_predict(self, dic_data):
"""
该方法目前只对[序号(非标准), 数量, 类别, 参数, 型号, 品牌, other]进行预测, 前三者是非模型预测
模型预测目前只对[数量, 类别, 参数, 型号, 品牌, 封装, 序号, 位号, 单位]进行预测
:param dic_data:
:return :只有[参数, 数量]会强制有结果
:return :结果字典
"""
prob_columns = []
temp_pre_model_res = {}
......@@ -139,8 +139,8 @@ class DicPredict(BasePredictor):
prob_columns}
for k, v in set_not_null_dic_data.items():
li_single_pred_res = []
for string in v:
single_pred_res, probdic = self.get_single_predict(string)
for item in v:
single_pred_res, probdic = self.get_single_predict(item, BasePredictor.model, BasePredictor.extractor)
li_single_pred_res.append(single_pred_res)
result = BasePredictor.repeat_max(li_single_pred_res)
# 如果该列被预测为其他, 则不做改动
......@@ -149,28 +149,41 @@ class DicPredict(BasePredictor):
# 如果类别列被预测为品牌, 则不做改动, 此处是基于目前的模型容易把中文多的列预测为品牌, 无奈之下所做的逻辑
if temp_pre_model_res.get(k) == '类别' and result == 'brand_name':
continue
# 参数列单独要求空置率不能高于PARAM_NAN_RATE
if result == 'param':
if BasePredictor.get_nan_rate(v) >= PARAM_NAN_RATE:
continue
temp_pre_model_res[k] = EN_TO_ZH_MAP[result]
# 若有多个型号列,则进行不同率的比较, 不同率最高的选为目标列
not_null_dic_data = {k: list(filter(lambda x: x != BasePredictor.PLACEHOLDER, dic_data[k])) for k in
prob_columns}
# prob_param_cols = [i for i in temp_pre_model_res if temp_pre_model_res[i] == '参数']
# 若有多个型号列,则进行不同率的比较, 不同率最高的选为目标列
prob_gn_cols = [i for i in temp_pre_model_res if temp_pre_model_res[i] == '型号']
for param_or_gn_col_list in [prob_gn_cols]:
if len(param_or_gn_col_list) >= 2:
li_diffrate = [(col, BasePredictor.get_diffrate(not_null_dic_data[col])) for col in
param_or_gn_col_list]
sort_li_diffrate = sorted(li_diffrate, key=lambda x: x[1], reverse=True)
for col_diffrate in sort_li_diffrate[1:]:
temp_pre_model_res.pop(col_diffrate[0])
# 若有多个参数列,则进行参数特征的数量比较, 特征最多的选为目标列
if len(prob_gn_cols) >= 2:
li_diffrate = [(col, BasePredictor.get_diffrate(not_null_dic_data[col])) for col in
prob_gn_cols]
sort_li_diffrate = sorted(li_diffrate, key=lambda x: x[1], reverse=True)
for col_diffrate in sort_li_diffrate[1:]:
temp_pre_model_res.pop(col_diffrate[0])
# When several columns were predicted as "参数" (parameter): first promote the
# column with the highest encapsulation rate to "封装", but only if that rate
# is strictly greater than 0. Then compare the parameter-feature rate of the
# remaining candidates and keep only the column with the highest one.
prob_param_cols = [i for i in temp_pre_model_res if temp_pre_model_res[i] == '参数']
self.info.info(f'可能的参数列有{str(prob_param_cols)}')
if len(prob_param_cols) >= 2:
    li_encap_rate = [(col, BasePredictor.get_encap_rate(not_null_dic_data[col])) for col in
                     prob_param_cols]
    sort_li_encaprate = sorted(li_encap_rate, key=lambda x: x[1], reverse=True)
    best_encap_col, best_encap_rate = sort_li_encaprate[0]
    # The rate must be > 0. The previous ``>= 0`` test was always true, so a
    # column was relabelled "封装" even when its encapsulation rate was 0.
    if best_encap_rate > 0:
        temp_pre_model_res[best_encap_col] = '封装'
        prob_param_cols.remove(best_encap_col)
    li_feature_rate = [(col, BasePredictor.get_param_featurerate(not_null_dic_data[col])) for col in
                       prob_param_cols]
    sort_li_fearate = sorted(li_feature_rate, key=lambda x: x[1], reverse=True)
    # Everything after the best-scoring column is dropped from the result.
    for col_fearate in sort_li_fearate[1:]:
        self.info.info(f'参数列丢弃{col_fearate[0]}')
        temp_pre_model_res.pop(col_fearate[0])
# 若有多个数量列,则进行空置率的比较, 空置率最低的选为目标列, #TODO 后续可能需要改成数量元素的占比率
......@@ -178,11 +191,8 @@ class DicPredict(BasePredictor):
if len(prob_num_cols) >= 2:
li_nullrate = []
for prob_num_col in prob_num_cols:
counter = 0
for item in dic_data[prob_num_col]:
if str(item).strip() == BasePredictor.PLACEHOLDER:
counter += 1
li_nullrate.append((prob_num_col, counter / len(dic_data[prob_num_col])))
nanrate = BasePredictor.get_nan_rate(dic_data[prob_num_col])
li_nullrate.append((prob_num_col, nanrate))
sort_li_nullrate = sorted(li_nullrate, key=lambda x: x[1])
for col_nullrate in sort_li_nullrate[1:]:
temp_pre_model_res.pop(col_nullrate[0])
......@@ -192,11 +202,8 @@ class DicPredict(BasePredictor):
if len(prob_brand_cols) >= 2:
li_nullrate = []
for prob_brand_col in prob_brand_cols:
counter = 0
for item in dic_data[prob_brand_col]:
if str(item).strip() == BasePredictor.PLACEHOLDER:
counter += 1
li_nullrate.append((prob_brand_col, counter / len(dic_data[prob_brand_col])))
nanrate = BasePredictor.get_nan_rate(dic_data[prob_brand_col])
li_nullrate.append((prob_brand_col, nanrate))
sort_li_nullrate = sorted(li_nullrate, key=lambda x: x[1])
for col_nullrate in sort_li_nullrate[1:]:
temp_pre_model_res.pop(col_nullrate[0])
......
......@@ -7,8 +7,14 @@ class KwPredict(BasePredictor):
def predict(self, string, predict_type='all'):
    """Predict the field type of a single keyword.

    :param string: keyword text to classify.
    :param predict_type: 'all' uses ``BasePredictor.model`` and
        ``BasePredictor.extractor``; any other value ``t`` selects the class
        attributes ``BasePredictor.<t>_model`` / ``BasePredictor.<t>_extractor``.
    :return: dict with the keys 'result', 'probably_dict' and 'predict_type'.
        For 'all', the result and the probability keys are translated through
        ``EN_TO_ZH_MAP``; otherwise the result is converted with ``int`` and
        the probabilities are returned as produced by ``get_single_predict``.
    :raises AttributeError: when no model/extractor attribute exists for
        ``predict_type``.
    """
    self.info.info('预测类型为: %s, 接收数据: %s' % (predict_type, string))
    if predict_type == 'all':
        result, prab = self.get_single_predict(string, BasePredictor.model, BasePredictor.extractor)
        zh_prob = {EN_TO_ZH_MAP[k]: v for k, v in prab.items()}
        return {'result': EN_TO_ZH_MAP[result], 'probably_dict': zh_prob, 'predict_type': predict_type}

    # NOTE(review): predict_type may come from caller-supplied input; an
    # unknown value surfaces as AttributeError on the lookups below.
    model = getattr(BasePredictor, predict_type + '_model')
    extractor = getattr(BasePredictor, predict_type + '_extractor')
    result, prab = self.get_single_predict(string, model, extractor)
    # The response built here is returned as-is; it is no longer overwritten
    # by the old two-key result dict.
    return {'result': int(result), 'probably_dict': prab, 'predict_type': predict_type}
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Possible header fields. Lookups elsewhere compare lower-cased cell text
# against this list.
# NOTE(review): "Footprint封装" and "Footprint" contain upper-case letters and
# therefore cannot match a lower-cased lookup -- confirm whether they should
# be lower-cased.
PROB_FIELDS = [
    "序号", "名称", "规格", "mpn", "用量(pcs)", "用量", "pcs", "位号", "描述", "值", "数量", "封装", "类别",
    "a面位置", "b面位置", "备注",
    "需求数量", "售价", "封装", "封装规格", '型号', '参数', '数量', '品牌', '型号', '类型', "quantity", "qty",
    "buy qty",
    # A comma was missing after "description", which silently fused it with
    # the next literal into the single entry "description参考品牌".
    "buy quantity", "需求用量", "manufacturer制造商", "manufacturer", "description",
    "参考品牌", "品牌", "item", "厂商编码", "品牌/厂商", "参考料号", "参考供应商", "top面", "bottom面",
    "designator", "remark", "元器件",
    "标号", "需求型号", "Footprint封装", "Footprint", "产品分类", "单板数量", "规格型号", "packagereference",
    "footprint封装",
]
# Every field is also accepted with a leading '*'.
PROB_FIELDS_1 = ['*' + i for i in PROB_FIELDS]
PROB_FIELDS.extend(PROB_FIELDS_1)
def _with_starred(names):
    """Return ``names`` followed by a '*'-prefixed copy of every name."""
    return names + ['*' + name for name in names]


# Mapping between standard field names and the aliases ("pronouns") that may
# appear in a sheet header. Each list is defined exactly once; every alias is
# also accepted with a leading '*'.
li_category = _with_starred(["类别", "分类", "名称", "类别名称", "类型", "产品分类"])
li_param = _with_starred(["参数", "规格", "描述", "值", "description"])
li_gn = _with_starred(["型号", "参考料号", "料号", "mpn", "厂商编码", "元器件", "需求型号", "规格型号"])
li_num = _with_starred(["数量", "用量(pcs)", "用量", "pcs", "quantity", "qty", "buy qty", "buy quantity",
                        "需求用量", "单板数量", "采购数量"])
li_brand = _with_starred(["品牌", "品牌/厂商", "参考品牌", "参考供应商", "厂商", "参考供应商", "参考厂商",
                          "manufacturer制造商", "manufacturer", "厂牌"])
li_encap = _with_starred(["封装", "封装规格", "encapsulation", "footprint封装", 'packagereference'])

# Header names that have no entry in STD_FIELDS_MAP.
PROB_FIELDS = ["序号", "位号", "a面位置", "位置", "b面位置", "备注", "售价", "item", "top面", "bottom面",
               "designator", "remark", "标号"]
AB_FIELDS = _with_starred(PROB_FIELDS)

# Possible header fields
ALL_FIELDS = AB_FIELDS + li_category + li_param + li_gn + li_num + li_brand + li_encap

# Standard field name -> accepted aliases. The "品牌" entry appears once; the
# duplicated entry without a trailing comma was a syntax error.
STD_FIELDS_MAP = {
    "类别": li_category,
    "参数": li_param,
    "型号": li_gn,
    "数量": li_num,
    "品牌": li_brand,
    "封装": li_encap,
}
# Standard fields that must always be returned and validated
MUST_STD_FIELDS = ['参数', '数量']
# Mapping from English label names to Chinese field names. It is defined once;
# the earlier assignment without 'encap' was immediately overwritten.
EN_TO_ZH_MAP = {
    'brand_name': '品牌',
    'param': '参数',
    'goods_name': '型号',
    'other': '其他',
    'encap': '封装',
}
# 类别合集 从learning_data.lie_category导入, 并添加了部分短英文Category
CATEGORY = ["半导体", "嵌入式", "光电子", "光源", "无源", "连接器", "断路器", "指示灯", "声源", "接触器", "铁氧芯", "冷热系统", "电源", "电线", "机械",
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment