Merge branch 'master' of ssh://119.23.72.7:22611/lic/bom_identify into dev

# Conflicts: # utils/excel_manager.py

Merge branch 'master' of ssh://119.23.72.7:22611/lic/bom_identify into dev
# Conflicts: # utils/excel_manager.py
17dc32e3 · lzzzzl · 76ff0b71 · efc76efc · 17dc32e3 · 17dc32e3
Commit 17dc32e3 authored Jun 01, 2020 by lzzzzl
Showing with 169 additions and 74 deletions
classify_server.py
client/client.py
extractor
model
predic_fac.py
predict/base_handler.py
predict/dict_predict.py
utils/excel_manager.py
--- a/classify_server.py
+++ b/classify_server.py
@@ -18,8 +18,9 @@ class Classify(classify_pb2_grpc.classifyServicer):


    def Classify(self, request, context):
-        print('接收数据: ' + request.keyword)
-        res = self.predictorfac.predict(request.keyword)
+        dic_data = json.loads(request.keyword)
+        print(dic_data)
+        res = self.predictorfac.predict(dic_data)
        return classify_pb2.ClassifyReply(message='result {msg}'.format(msg=res))

    def fac_test_predic(self, data):
@@ -45,4 +46,7 @@ if __name__ == '__main__':
    # data = read_from_excel('DZ0901_V1.4_BOM.xlsx', 'DZ0901_V1.3BOM清单')
    # print(data)
    # print(Classify().fac_test_predic(data))
+    for i in range(65, 91):
+        print(chr(i))
+

--- a/client/client.py
+++ b/client/client.py
@@ -4,7 +4,8 @@ from protobuf import classify_pb2_grpc
 import pandas as pd
 import json
 def get_test_data():
-    df=pd.read_excel(r'C:\Users\ICHUNT\Desktop\bomlist\DZ0901_V1.4_BOM.xlsx',header=None)
+    df=pd.read_excel(r'C:\Users\ICHUNT\Desktop\bomlist\00 BSJ BMS合并-询价-珠海能源.xlsx', header=None)
+    print(df)
    df.fillna(' ', inplace=True)
    dic_dft = df.to_dict(orient='list')
    return json.dumps(dic_dft)

--- a/extractor
+++ b/extractor
--- a/model
+++ b/model
--- a/predic_fac.py
+++ b/predic_fac.py
@@ -19,10 +19,9 @@ class PredictorFac():
        elif isinstance(data, dict):
            return self.dict_predictor

-    def predict(self, data):
-        dic_data = json.loads(data)
+    def predict(self, dic_data, predict_type='all'):
        predictor = self._get_predictor(dic_data)
-        res = predictor.predict(dic_data)
+        res = predictor.predict(dic_data, predict_type)
        return res


@@ -30,8 +29,8 @@ if __name__ == "__main__":
    def get_test_data():
        import pandas as pd
        import json
-        df = pd.read_excel(r'C:\Users\ICHUNT\Desktop\bomlist\DZ0901_V1.4_BOM.xlsx', header=None)
-        df.fillna(' ', inplace=True)
+        df = pd.read_excel(r'C:\data\lx\51AB0571_ CCTV ASST询价_SZIMS.xlsx', header=None, sheet_name='1')
+        df.fillna('?', inplace=True)
        dic_dft = df.to_dict(orient='list')
        return json.dumps(dic_dft)

@@ -40,7 +39,8 @@ if __name__ == "__main__":

    data = get_test_data()
    p = PredictorFac(model_config)
-    res = p.predict(data)
+    data = json.loads(data)
+    res = p.predict(data,predict_type='model')
    print(res)


--- a/predict/base_handler.py
+++ b/predict/base_handler.py
 #!/usr/bin/env python
 # -*- coding:utf-8 -*-
+import re

 from utils.log_manager import get_logger
 from utils.robots import dd_send_msg
 import pandas as pd

-
+NUMBER_LEVEL = 0.7
+SEQ_LEVEL = 0.5
 class BasePredictor:
    '''
        预测类基类
@@ -20,6 +22,7 @@ class BasePredictor:
        self.robot_msg = dd_send_msg
        self.pd = pd

+
    def predict(self, key):
        raise NotImplementedError

@@ -37,3 +40,47 @@ class BasePredictor:
        deal_list = [round(i, 3) for i in proba[0].tolist()]
        dic_proba = {k: v for k, v in zip(classes, deal_list)}
        return predictions[0], dic_proba
+
+    def isseq(self, data):
+        """
+        Sequence-number column prediction.
+
+        Collects the int/float cells of the column (truncated with ``int``)
+        and accepts the column when those cells are non-decreasing and make
+        up at least SEQ_LEVEL of all cells.
+
+        :param data: list of cell values of one column
+        :return: True if the column looks like a sequence-number column
+        """
+        numeric_cells = []
+        for cell in data:
+            if isinstance(cell, float) or isinstance(cell, int):
+                numeric_cells.append(int(cell))
+
+        # no numeric cell at all: cannot be a sequence column
+        if len(numeric_cells) < 1:
+            return False
+        if not self.isIncrease(numeric_cells, len(numeric_cells)):
+            return False
+
+        # share of numeric cells among all cells of the column
+        ratio = round(len(numeric_cells) / len(data), 3)
+        return ratio >= SEQ_LEVEL
+
+    def isnum(self, data):
+        """
+        Quantity column prediction.
+
+        A cell counts as a quantity when it is an ``int`` or when
+        ``isNumberCol`` accepts it; the column is accepted when such cells
+        make up at least NUMBER_LEVEL of all cells.
+
+        :param data: list of cell values of one column
+        :return: True if the column looks like a quantity column,
+                 False otherwise (including for an empty column)
+        """
+        # an empty column has no rate to compute (would divide by zero)
+        if len(data) == 0:
+            return False
+        # NOTE(review): float cells are not counted here although isseq
+        # accepts them - confirm this is intended
+        collect_num = [kw for kw in data if isinstance(kw, int) or self.isNumberCol(kw)]
+        rate = round(len(collect_num) / len(data), 3)
+        return rate >= NUMBER_LEVEL
+
+    def isIncrease(self, arr, size):
+        """
+        Check whether the first ``size`` elements of ``arr`` are non-decreasing.
+
+        Implemented iteratively: a recursive check needs one stack frame per
+        element and raises RecursionError on columns longer than the
+        interpreter's recursion limit.
+
+        :param arr: sequence of mutually comparable values
+        :param size: number of leading elements to check
+        :return: True if arr[i] >= arr[i - 1] for every 1 <= i < size
+                 (trivially True when size <= 1), otherwise False
+        """
+        return all(arr[i] >= arr[i - 1] for i in range(1, size))
+
+
+    def isNumberCol(self, kw):
+        """
+        Test whether a cell looks like a quantity: leading digits followed by
+        'K' (case-insensitive) or by one to three characters in the range
+        U+4E00-U+9FA5, up to the end of the line.
+
+        :param kw: cell value
+        :return: the ``re.match`` result (match object or None) for a string,
+                 False for any non-string value
+        """
+        if not isinstance(kw, str):
+            return False
+        quantity_pattern = r'(\d+)((K)|([\u4E00-\u9FA5]{1,3}))$'
+        return re.match(quantity_pattern, kw, re.M | re.I)
+
+    def valid_seq(self, data):
+        """Negation of ``isseq``: True when the column is NOT a sequence-number column."""
+        looks_like_sequence = self.isseq(data)
+        return not looks_like_sequence
+
+    def valid_num(self, data):
+        """Negation of ``isnum``: True when the column is NOT a quantity column."""
+        looks_like_quantity = self.isnum(data)
+        return not looks_like_quantity
\ No newline at end of file
--- a/predict/dict_predict.py
+++ b/predict/dict_predict.py
 #!/usr/bin/env python
 # -*- coding:utf-8 -*-
+from collections import Counter
+
 from predict.base_handler import BasePredictor

 # 可能的头部字段
 prob_fields = ["序号", "名称", "规格", "MPN", "用量(pcs)", "用量", "pcs", "位号", "描述", "值", "数量", "封装", "类别", "a面位置", "b面位置", "备注",
-               "需求数量",
+               "需求数量", "售价",
               "参考品牌", "品牌", "item", "厂商编码", "品牌/厂商", "参考料号", "参考供应商", "top面", "bottom面"]

 # 标准名和代名词的映射
 fields_map = {"序号": ["序号"],
              "类别": ["类别", "分类", "名称", "类别名称"],
              "参数": ["参数", "规格", "描述"],
-              "型号": ["型号", "参考料号", "料号", "MPN"],
+              "型号": ["型号", "参考料号", "料号", "MPN", "厂商编码"],
              "数量": ["数量", "用量(pcs)", "PCS", "用量", "用量(PCS)", "pcs"],
              "封装": ["封装", "封装规格"],
-              "品牌": ["品牌", "品牌/厂商", "参考品牌", "厂商编码", "参考供应商", "厂商", "参考供应商", "参考厂商"]}
+              "品牌": ["品牌", "品牌/厂商", "参考品牌", "参考供应商", "厂商", "参考供应商", "参考厂商"]}

+# 必须返回也必须验证的标准字段
+MUST_STD_FIELDS = ['参数', '数量']
 #
 order_list = ['序号']
-
+en_to_zh_map = {'brand_name': '品牌', 'param': '参数', 'goods_name': '型号'}

 def fun(seri):
    li_seri = seri.tolist()
@@ -28,14 +32,18 @@ def fun(seri):


 # 取前多少行
-HEAD_ROW = 5
+HEAD_ROW = 7
+# 空置率阈值
+NAN_RATE = 0.8
+# 占位符
+PLACEHOLDER = '?'


 class DicPredict(BasePredictor):

    def id_by_field(self, df_head):
        """
-        :param df_head: 传入接收数据的头部Dataframe(默认5行)
+        :param df_head: 传入接收数据的头部Dataframe(默认7行)
        :return li_res: 返回结果列表
        """
        series = df_head.apply(fun)
@@ -60,14 +68,6 @@ class DicPredict(BasePredictor):
        columns = []
        li_data = []
        for k, v in dict_data.items():
-
-            # 去掉空置率大于等于0.8的列
-            counter = 0
-            for item in v:
-                if not str(item).strip():
-                    counter += 1
-            if counter / len(v) >= 0.8:
-                continue
            columns.append(k)
            li_data.append(v)

@@ -84,31 +84,89 @@ class DicPredict(BasePredictor):
            else:
                dic_ab_res = {i['column_name']: i['pronoun']}
                ab_result.append(dic_ab_res)
-        id_res = {
+        pre_id_res = {
            'std_result': std_result,
            'ab_result': ab_result,
        }
-        return id_res
-
-    def predict(self, dic_data):
-        res = self.pre_predict(dic_data)
-        if res:
-            return res
-        if len(dic_data) > 0:
-            self.order_predict(dic_data[0])
-        pass
-
-    def order_predict(self, data):
-        collect_num = [int(kw) for kw in data if isinstance(kw, float) or isinstance(kw, int)]
-        judge = self.IsIncrease(collect_num, len(collect_num))
-        print('judge: ' + str(judge))
-        return judge
-
-    """
-    判断列表元素是否递增
-    """
-
-    def IsIncrease(self, arr, size):
-        if size == 1:
+        return pre_id_res
+
+    def model_predict(self, dic_data):
+        """
+        Predict the standard field of each column from its cell values.
+
+        Columns accepted by ``v_chain`` are classified cell by cell with
+        ``get_single_predict`` and take the most frequent label, translated
+        through ``en_to_zh_map``. Every other column is labelled '数量' when
+        ``isnum`` accepts it, else '序号' when ``isseq`` accepts it.
+
+        :param dic_data: mapping of column name -> list of cell values
+        :return: mapping of column name -> predicted standard field name;
+                 columns for which nothing could be predicted are omitted
+        """
+        print(dic_data)
+        prob_columns = []
+        column_fields = {}
+
+        for k, v in dic_data.items():
+            bol = self.v_chain(v)
+            if bol:
+                print(k, bol)
+                prob_columns.append(k)
+            elif self.isnum(v):
+                column_fields[k] = '数量'
+            elif self.isseq(v):
+                column_fields[k] = '序号'
+
+        for k in prob_columns:
+            # placeholder cells carry no information for the classifier
+            cells = [cell for cell in dic_data[k] if cell != PLACEHOLDER]
+            if not cells:
+                # nothing left to vote on; indexing the empty ranking would fail
+                continue
+            votes = Counter(self.get_single_predict(cell)[0] for cell in cells)
+            # e.g. [('brand_name', 4), ('goods_name', 3), ('param', 2)]
+            li_sort = votes.most_common()
+            print(k, li_sort)
+            # a label without a mapping is skipped instead of raising KeyError
+            field = en_to_zh_map.get(li_sort[0][0])
+            if field is not None:
+                column_fields[k] = field
+        print(column_fields)
+
+        # the caller (predict) tests this return value, so hand the mapping
+        # back instead of discarding it
+        return column_fields
+
+    def predict(self, dic_data, predict_type='all'):
+        """
+        Pre-process the columns with ``pre_deal`` and run the requested prediction.
+
+        :param dic_data: mapping of column name -> list of cell values
+        :param predict_type: 'all' or 'pre' run ``pre_predict``;
+                             'model' runs ``model_predict``
+        :return: the prediction result when it is truthy, otherwise None
+                 (also None for any other ``predict_type``)
+        """
+        dic_data = self.pre_deal(dic_data)
+
+        if predict_type in ('all', 'pre'):
+            outcome = self.pre_predict(dic_data)
+        elif predict_type == 'model':
+            outcome = self.model_predict(dic_data)
+        else:
+            return None
+
+        # an empty/falsy result is reported as None
+        return outcome if outcome else None
+
+    def pre_deal(self, dic_data):
+        """
+        Drop the columns that are mostly empty before prediction.
+
+        A cell is empty when, after stripping whitespace, it is blank or
+        equal to PLACEHOLDER. A column is kept only when its share of empty
+        cells is strictly below NAN_RATE.
+
+        :param dic_data: mapping of column name -> list of cell values
+        :return: new dict containing only the columns that are kept
+        """
+        new_dic_data = {}
+        for k, v in dic_data.items():
+            if len(v) == 0:
+                # a column without cells has no data and no rate to compute
+                continue
+            # a blank cell (e.g. NaN filled with ' ') is as empty as the placeholder
+            empty_cells = sum(1 for item in v if str(item).strip() in ('', PLACEHOLDER))
+            # drop columns whose empty rate is greater than or equal to NAN_RATE
+            if empty_cells / len(v) < NAN_RATE:
+                new_dic_data[k] = v
+
+        return new_dic_data
+
+    def v_chain(self, li):
+        """
+        Validation chain: run every method of this instance whose name starts
+        with ``valid_`` against the column and stop at the first failure.
+
+        :param li: list of cell values of one column
+        :return: False as soon as one validator returns a falsy value,
+                 True when all of them pass
+        """
+        validator_names = [name for name in dir(self) if name.startswith('valid_')]
+        for validator_name in validator_names:
+            validator = getattr(self, validator_name)
+            if validator(li):
+                continue
+            return False
+        else:
            return True
-        return (arr[size - 1] >= arr[size - 2]) and self.IsIncrease(arr, size - 1)
+
--- a/utils/excel_manager.py
+++ b/utils/excel_manager.py
 #!/usr/bin/env python
 # -*- coding:utf-8 -*-
-import pandas as pd
-
-
-# def read_from_excel(file_name, sheet_name):
-#     wb = xlrd.open_workbook(file_name)
-#     sheet = wb.sheet_by_name(sheet_name)
-#     row = sheet.nrows
-#     col = sheet.ncols
-#     result_dict = {}
-#     for i in range(col):
-#         col_list = []
-#         for j in range(row): col_list.append(sheet.cell_value(j, i))
-#         result_dict[i] = col_list
-#     return result_dict
-
-
-def read_from_excel(file_name, sheet_name):
-    data = pd.read_excel(file_name, sheet_name=sheet_name, keep_default_na=False, header=-1)
-    result_dict = {}
-    count = 0
-    for index in data.columns:
-        result_dict[count] = data[index].tolist()
-        count += 1
-    return result_dict
-
-
+l=[' ', ' ', ' ', '不需要报价', ' ', ' ', ' ', '不需要报价', ' ', ' ', '不需要报价', ' ', ' ', ' ', '不需要报价', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
+for v in l:
+    # 去掉空置率大于等于0.8的列
+    counter = 0
+    for item in v:
+        if str(item).strip() == PLACEHOLDER:
+            counter += 1
+    if counter / len(v) <= NAN_RATE:
+        new_dic_data[k] = v
\ No newline at end of file