数量新增识别小数列

4bd8fa67 · lzzzzl · 17dc32e3 · b2d26416 · 4bd8fa67 · 4bd8fa67
Commit 4bd8fa67 authored Jun 01, 2020 by lzzzzl
Showing with 161 additions and 80 deletions
client/client.py
extractor
model
predic_fac.py
predict/base_handler.py
predict/dict_predict.py
--- a/client/client.py
+++ b/client/client.py
@@ -4,8 +4,7 @@ from protobuf import classify_pb2_grpc
 import pandas as pd
 import json
 def get_test_data():
-    df=pd.read_excel(r'C:\Users\ICHUNT\Desktop\bomlist\00 BSJ BMS合并-询价-珠海能源.xlsx', header=None)
+    df=pd.read_excel(r'C:\Users\ICHUNT\Desktop\bomlist\DZ0901_V1.4_BOM.xlsx',header=None)
-    print(df)
    df.fillna(' ', inplace=True)
    dic_dft = df.to_dict(orient='list')
    return json.dumps(dic_dft)

--- a/extractor
+++ b/extractor
--- a/model
+++ b/model
--- a/predic_fac.py
+++ b/predic_fac.py
@@ -29,7 +29,7 @@ if __name__ == "__main__":
    def get_test_data():
        import pandas as pd
        import json
-        df = pd.read_excel(r'C:\data\lx\51AB0571_ CCTV ASST询价_SZIMS.xlsx', header=None, sheet_name='1')
+        df = pd.read_excel(r'C:\Users\ICHUNT\Desktop\bomlist\51AB0571_ CCTV ASST询价_SZIMS.xlsx', header=None, sheet_name='3')
        df.fillna('?', inplace=True)
        dic_dft = df.to_dict(orient='list')
        return json.dumps(dic_dft)
@@ -40,7 +40,6 @@ if __name__ == "__main__":
    data = get_test_data()
    p = PredictorFac(model_config)
    data = json.loads(data)
-    res = p.predict(data,predict_type='model')
+    print(data)
+    res = p.predict(data,predict_type='all')
    print(res)
--- a/predict/base_handler.py
+++ b/predict/base_handler.py
 #!/usr/bin/env python
 # -*- coding:utf-8 -*-
 import re
+from collections import Counter
 from utils.log_manager import get_logger
 from utils.robots import dd_send_msg
 import pandas as pd
-NUMBER_LEVEL = 0.7
+CATEGORY = ['二极管']
+RIGHT_LEVEL = 0.7
 SEQ_LEVEL = 0.5
+CATE_LEVEL = 0.5
+MULTI_SAME_LEVEL = 3
 class BasePredictor:
    '''
        预测类基类
    '''
+    # 占位符
+    PLACEHOLDER = '?'
    def __init__(self, name, extractor, classifier):
        self.name = name
        self.classifier = classifier
@@ -22,16 +31,16 @@ class BasePredictor:
        self.robot_msg = dd_send_msg
        self.pd = pd
    def predict(self, key):
        raise NotImplementedError
-    def get_single_predict(self, string: object) -> tuple:
+    def get_single_predict(self, obj: object) -> tuple:
        '''
        :param obj: 接收单个要判断的对象, 内部会先用 str() 转换为字符串
        :return tuple: 返回两个元素的元组, 第一个元素为判断结果, 第二个元素为可能性的字典
        '''
+        string = str(obj)
        series = self.pd.Series([string])
        feature = self.extractor.transform(series)
        predictions = self.classifier.predict(feature)
@@ -41,46 +50,80 @@ class BasePredictor:
        dic_proba = {k: v for k, v in zip(classes, deal_list)}
        return predictions[0], dic_proba
-    def isseq(self, data):
+    def valid_seq(self, data):
-        """
+        """取反"""
-        序号列预测
+        return not self.is_seq(data)
-        """
-        collect_seq = [int(kw) for kw in data if isinstance(kw, float) or isinstance(kw, int)]
+    def valid_num(self, data):
-        if len(collect_seq) < 1 or not self.isIncrease(collect_seq, len(collect_seq)):
+        return not self.is_num(data)
-            return False
-        else:
-            rate = round(len(collect_seq) / len(data), 3)
-            return True if rate >= SEQ_LEVEL else False
-    def isnum(self, data):
+    def valid_cate(self, data):
+        return not self.is_catecol(data)
+    @classmethod
+    def is_num(self, data):
        """
        数量列预测
        """
-        collect_num = [kw for kw in data if isinstance(kw, int) or self.isNumberCol(kw)]
+        def isNumberCol(kw):
+            """
+            是否是数量列辅助函数
+            """
+            if isinstance(kw, str):
+                return re.match(r'(\d+|\d+(\.\d+))($|(K)|([\u4E00-\u9FA5]{1,3}))$', kw, re.M | re.I)
+            else:
+                return False
+        collect_num = [kw for kw in data if isinstance(kw, int) or isinstance(kw, float) or isNumberCol(str(kw))]
        rate = round(len(collect_num) / len(data), 3)
-        return True if rate >= NUMBER_LEVEL else False
+        return True if rate >= RIGHT_LEVEL else False
-    def isIncrease(self, arr, size):
+    @classmethod
-        """
+    def is_catecol(self, data):
-        判断列表元素是否递增
+        cates = []
-        """
+        for i in data:
-        if size == 1:
+            for j in CATEGORY:
-            return True
+                if j in str(i):
-        return (arr[size - 1] >= arr[size - 2]) and self.isIncrease(arr, size - 1)
+                    cates.append(i)
+        rate = round(len(cates) / len(data), 3)
+        return rate >= CATE_LEVEL
+    @classmethod
+    def is_multi_same(self, data):
+        no_null_data = list(filter(lambda x: x != self.PLACEHOLDER, data))
+        result = Counter(no_null_data)
+        li_sort = sorted(result.items(), key=lambda x: x[1], reverse=True)
+        return li_sort[0][1] >= MULTI_SAME_LEVEL
-    def isNumberCol(self, kw):
+    @classmethod
+    def is_seq(self, data):
        """
-        是否是数量列
+        序号列预测
        """
-        if isinstance(kw, str):
-            return re.match(r'(\d+)((K)|([\u4E00-\u9FA5]{1,3}))$', kw, re.M | re.I)
+        def isIncrease(arr, size):
-        else:
+            """
+            判断列表元素是否递增
+            """
+            if size == 1:
+                return True
+            return (arr[size - 1] >= arr[size - 2]) and isIncrease(arr, size - 1)
+        collect_seq = [int(kw) for kw in data if isinstance(kw, float) or isinstance(kw, int)]
+        if len(collect_seq) < 1 or not isIncrease(collect_seq, len(collect_seq)):
            return False
+        else:
+            rate = round(len(collect_seq) / len(data), 3)
+            return True if rate >= SEQ_LEVEL else False
-    def valid_seq(self, data):
-        """取反"""
-        return not self.isseq(data)
-    def valid_num(self, data):
+if __name__ == "__main__":
-        return not self.isnum(data)
+    li = ['?', 3400.0, 5920.0, 4849.0, 2544.0, 3270.0, 52751.0, 2031.0, 5302.0, 726.0, 1247.0, 2472.0, 689.0, 6049.0,
\ No newline at end of file
+          26796.0, 6164.0, 1605.0, 4346.0, 640.0, 960.0, 960.0, 320.0, 160.0, 860.0, 160.0, 320.0, 3183.0, 10151.0,
+          640.0, 130.0, 1237.0, 800.0, 960.0, 3740.0, 17701.0, 2146.0, 1280.0, 160.0, 1120.0, 160.0, 480.0, 960.0,
+          480.0, 160.0, 4717.0, 160.0, 160.0, 160.0, 640.0, 160.0, 320.0, 160.0, 160.0, 800.0, 800.0, 480.0, 1600.0,
+          155.0, 960.0, 320.0, 944.0, 160.0, 160.0, 1280.0, 1852.0, 7680.0, 7680.0, 2880.0, 160.0, 224.0, 480.0, 480.0,
+          640.0, 160.0, 640.0, 320.0, 1760.0, 640.0, 480.0, 960.0, 160.0, 160.0, 160.0, 160.0, 1920.0, 160.0, 5600.0,
+          480.0, 2560.0, 160.0, 160.0, 160.0, 160.0, 160.0, 1280.0, 160.0, 160.0, 160.0, 160.0, 160.0, 320.0, 0.0,
+          160.0, 160.0]
+    print(BasePredictor.is_num(li))
--- a/predict/dict_predict.py
+++ b/predict/dict_predict.py
@@ -5,38 +5,42 @@ from collections import Counter
 from predict.base_handler import BasePredictor
 # 可能的头部字段
-prob_fields = ["序号", "名称", "规格", "MPN", "用量(pcs)", "用量", "pcs", "位号", "描述", "值", "数量", "封装", "类别", "a面位置", "b面位置", "备注",
+PROB_FIELDS = ["序号", "名称", "规格", "MPN", "用量(pcs)", "用量", "pcs", "位号", "描述", "值", "数量", "封装", "类别", "a面位置", "b面位置", "备注",
-               "需求数量", "售价",
+               "需求数量", "售价", "封装", "封装规格",
               "参考品牌", "品牌", "item", "厂商编码", "品牌/厂商", "参考料号", "参考供应商", "top面", "bottom面"]
 # 标准名和代名词的映射
-fields_map = {"序号": ["序号"],
+STD_FIELDS_MAP = {
              "类别": ["类别", "分类", "名称", "类别名称"],
-              "参数": ["参数", "规格", "描述"],
+              "参数": ["参数", "规格", "描述", "值"],
              "型号": ["型号", "参考料号", "料号", "MPN", "厂商编码"],
              "数量": ["数量", "用量(pcs)", "PCS", "用量", "用量(PCS)", "pcs"],
-              "封装": ["封装", "封装规格"],
              "品牌": ["品牌", "品牌/厂商", "参考品牌", "参考供应商", "厂商", "参考供应商", "参考厂商"]}
 # 必须返回也必须验证的标准字段
 MUST_STD_FIELDS = ['参数', '数量']
 #
 order_list = ['序号']
 en_to_zh_map = {'brand_name': '品牌', 'param': '参数', 'goods_name': '型号'}
 def fun(seri):
    li_seri = seri.tolist()
    for field in li_seri:
-        if str(field).lower() in prob_fields:
+        if str(field).lower() in PROB_FIELDS:
            return field, seri.name
+def repeat_max(li):
+    result = Counter(li)
+    # [('brand_name', 4), ('goods_name', 3), ('param', 2)]
+    li_sort = sorted(result.items(), key=lambda x: x[1], reverse=True)
+    return li_sort[0][0]
 # 取前多少行
 HEAD_ROW = 7
 # 空置率阈值
 NAN_RATE = 0.8
-# 占位符
-PLACEHOLDER = '?'
 class DicPredict(BasePredictor):
@@ -56,14 +60,14 @@ class DicPredict(BasePredictor):
                'pronoun': field,
                'column_name': column_name
            }
-            for k, v in fields_map.items():
+            for k, v in STD_FIELDS_MAP.items():
-                if field.lower() in fields_map[k]:
+                if field.lower() in STD_FIELDS_MAP[k]:
                    dic['std_name'] = k
            li_res.append(dic)
        return li_res
-    def pre_predict(self, dict_data):
+    def head_predict(self, dict_data):
        columns = []
        li_data = []
@@ -75,15 +79,13 @@ class DicPredict(BasePredictor):
        dft = df.T.head(HEAD_ROW)
        dft.columns = columns
        li_res_raw = self.id_by_field(dft)
-        std_result = []
+        std_result = {}
-        ab_result = []
+        ab_result = {}
        for i in li_res_raw:
            if i.get('std_name'):
-                dic_has_res = {i['column_name']: i['std_name']}
+                std_result[i['column_name']] = i['std_name']
-                std_result.append(dic_has_res)
            else:
-                dic_ab_res = {i['column_name']: i['pronoun']}
+                ab_result[i['column_name']] = i['pronoun']
-                ab_result.append(dic_ab_res)
        pre_id_res = {
            'std_result': std_result,
            'ab_result': ab_result,
@@ -92,54 +94,73 @@ class DicPredict(BasePredictor):
    def model_predict(self, dic_data):
        """
-        该方法目前只对[参数, 型号, 数量, 品牌]进行预测
+        该方法目前只对[序号(非标准), 数量, 类别, 参数, 型号, 品牌]进行预测, 前三者是非模型预测
        :param dic_data:
-        :return:
+        :return :只有[参数, 数量]会强制有结果
        """
-        print(dic_data)
        prob_columns = []
        temp_pre_model_res = {}
+        ab_result = {}
        for k, v in dic_data.items():
            bol = self.v_chain(v)
            if bol:
-                print(k, bol)
                prob_columns.append(k)
                continue
-            if self.isnum(v):
+            if self.is_seq(v):
+                ab_result[k] = '序号'
+                continue
+            if self.is_num(v):
                temp_pre_model_res[k] = '数量'
                continue
-            if self.isseq(v):
+            if self.is_catecol(v):
-                temp_pre_model_res[k] = '序号'
+                temp_pre_model_res[k] = '类别'
-        temp_dic_data = {k: list(filter(lambda x: x != PLACEHOLDER, dic_data[k]))for k in prob_columns}
+                continue
+        temp_dic_data = {k: list(filter(lambda x: x != self.PLACEHOLDER, dic_data[k])) for k in prob_columns}
        for k, v in temp_dic_data.items():
            li_single_pred_res = []
            for string in v:
                single_pred_res, probdic = self.get_single_predict(string)
                li_single_pred_res.append(single_pred_res)
-            result = Counter(li_single_pred_res)
+            result = repeat_max(li_single_pred_res)
-            # [('brand_name', 4), ('goods_name', 3), ('param', 2)]
+            temp_pre_model_res[k] = en_to_zh_map[result]
-            li_sort = sorted(result.items(), key=lambda x: x[1], reverse=True)
-            print(k, li_sort)
+        # 参数和型号列出现多条相同值则丢弃
-            temp_pre_model_res[k] = en_to_zh_map[li_sort[0][0]]
+        prob_param_and_gn_cols = [i for i in temp_pre_model_res if temp_pre_model_res[i] == '参数' or temp_pre_model_res[i] == '型号']
-        print(temp_pre_model_res)
+        for col in prob_param_and_gn_cols:
+            if self.is_multi_same(temp_dic_data[col]):
-        pre_model_res = {}
+                temp_pre_model_res.pop(col)
+        model_id_res = {
+            'std_result': temp_pre_model_res,
+            'ab_result': ab_result,
+        }
+        return model_id_res
    def predict(self, dic_data, predict_type='all'):
        dic_data = self.pre_deal(dic_data)
        if predict_type == 'all':
-            pre_id_res = self.pre_predict(dic_data)
+            pre_id_res = self.head_predict(dic_data)
-            if pre_id_res:
+            model_id_res = self.model_predict(dic_data)
-                return pre_id_res
+            # 表头预测和模型预测最后返回的数据进行综合处理
+            pre_std_result = pre_id_res.get('std_result')
+            model_std_result = model_id_res.get('std_result')
+            comprehensive_res = self.get_comprehensive_res(pre_std_result, model_std_result)
+            if comprehensive_res:
+                res = {
+                    'std_result': comprehensive_res,
+                    'ab_result': pre_id_res['ab_result'],
+                }
+                return res
        elif predict_type == 'model':
            model_id_res = self.model_predict(dic_data)
            if model_id_res:
                return model_id_res
-        elif predict_type == 'pre':
-            pre_id_res = self.pre_predict(dic_data)
+        elif predict_type == 'head':
+            pre_id_res = self.head_predict(dic_data)
            if pre_id_res:
                return pre_id_res
@@ -149,7 +170,7 @@ class DicPredict(BasePredictor):
            # 去掉空置率大于等于0.8的列
            counter = 0
            for item in v:
-                if str(item).strip() == PLACEHOLDER:
+                if str(item).strip() == self.PLACEHOLDER:
                    counter += 1
            if counter / len(v) <= NAN_RATE:
                new_dic_data[k] = v
@@ -170,3 +191,23 @@ class DicPredict(BasePredictor):
        else:
            return True
+    def get_comprehensive_res(self, pre_std_result, model_std_result):
+        vote_count = {
+            "类别": [],
+            "参数": [],
+            "型号": [],
+            "数量": [],
+            "品牌": []
+        }
+        for k, v in pre_std_result.items():
+            vote_count[v].append(k)
+        for k, v in model_std_result.items():
+            vote_count[v].append(k)
+        comprehensive_res = {}
+        for std_name, col_li in vote_count.items():
+            if len(col_li) >= 1:
+                col = repeat_max(col_li)
+                comprehensive_res[col] = std_name
+        return comprehensive_res