多个参数列的选择从比较不同率改为比较特征率

b7f946c4 · lichenggang · a9c6c7dd · b7f946c4 · b7f946c4
Commit b7f946c4 authored Jun 10, 2020 by lichenggang
Showing with 43 additions and 20 deletions
predict/base_handler.py
predict/dict_predict.py
--- a/predict/base_handler.py
+++ b/predict/base_handler.py
@@ -61,7 +61,7 @@ class BasePredictor:
    #     return not self.is_catecol(data)

    @classmethod
-    def is_num(self, data):
+    def is_num(cls, data):
        """
        数量列预测
        """
@@ -87,7 +87,7 @@ class BasePredictor:
        return True if rate >= RIGHT_LEVEL else False

    @classmethod
-    def is_catecol(self, data):
+    def is_catecol(cls, data):
        cates = []
        for i in data:
            for j in CATEGORY:
@@ -98,14 +98,14 @@ class BasePredictor:
        return rate >= CATE_LEVEL

    @classmethod
-    def is_multi_same(self, data):
-        no_null_data = list(filter(lambda x: x != self.PLACEHOLDER, data))
+    def is_multi_same(cls, data):
+        no_null_data = list(filter(lambda x: x != cls.PLACEHOLDER, data))
        result = Counter(no_null_data)
        li_sort = sorted(result.items(), key=lambda x: x[1], reverse=True)
        return li_sort[0][1] >= MULTI_SAME_LEVEL

    @classmethod
-    def is_seq(self, data):
+    def is_seq(cls, data):
        """
        序号列预测
        """
@@ -126,7 +126,7 @@ class BasePredictor:
            return True if rate >= SEQ_LEVEL else False

    @classmethod
-    def get_diffrate(self, data):
+    def get_diffrate(cls, data):
        """
        得到不同率
        """
@@ -134,14 +134,14 @@ class BasePredictor:
        return rate

    @classmethod
-    def repeat_max(self, li):
+    def repeat_max(cls, li):
        result = Counter(li)
        # [('brand_name', 4), ('goods_name', 3), ('param', 2)]
        li_sort = sorted(result.items(), key=lambda x: x[1], reverse=True)
        return li_sort[0][0]

    @classmethod
-    def get_comprehensive_res(self, head_std_result, model_std_result):
+    def get_comprehensive_res(cls, head_std_result, model_std_result):

        vote_count = {
            "类别": [],
@@ -165,7 +165,7 @@ class BasePredictor:
        return comprehensive_res

    @classmethod
-    def is_ref(self, data):
+    def is_ref(cls, data):
        """
        位号列预测
        """
@@ -177,7 +177,7 @@ class BasePredictor:
        return round(count / len(data), 3) >= REF_LEVEL or False

    @classmethod
-    def is_pcs(self, data):
+    def is_pcs(cls, data):
        """
        pcs列
        """
@@ -188,5 +188,19 @@ class BasePredictor:
                count += 1
        return round(count / len(data), 3) >= PCS_LEVEL or False

+    @classmethod
+    def get_param_featurerate(cls, data):
+        """
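+        Feature rate of a parameter column: the fraction of items in data whose string form contains 'pf', '%', '±' or 'uf' (case-sensitive substring match), rounded to 3 decimals; an empty data raises ZeroDivisionError.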
+        """
+        feature_li = ['pf', '%', '±', 'uf']
+        count = 0
+        for i in data:
+            for feature in feature_li:
+                if feature in str(i).strip():
+                    count += 1
+                    break
+        return round(count / len(data), 3)
+
 if __name__ == "__main__":
    print(BasePredictor.is_num([3400.0, 5920.0, 4849.0, 2544.0, 3270.0, 52751.0, 2031.0, 5302.0, 726.0, 1247.0, 2472.0, 689.0, 6049.0, 26796.0, 6164.0, 1605.0, 4346.0, 640.0, 960.0, 960.0, 320.0, 160.0, 860.0, 160.0, 320.0, 3183.0, 10151.0, 640.0, 130.0, 1237.0, 800.0, 960.0, 3740.0, 17701.0, 2146.0, 1280.0, 160.0, 1120.0, 160.0, 480.0, 960.0, 480.0, 160.0, 4717.0, 160.0, 160.0, 160.0, 640.0, 160.0, 320.0, 160.0, 160.0, 800.0, 800.0, 480.0, 1600.0, 155.0, 960.0, 320.0, 944.0, 160.0, 160.0, 1280.0, 1852.0, 7680.0, 7680.0, 2880.0, 160.0, 224.0, 480.0, 480.0, 640.0, 160.0, 640.0, 320.0, 1760.0, 640.0, 480.0, 960.0, 160.0, 160.0, 160.0, 160.0, 1920.0, 160.0, 5600.0, 480.0, 2560.0, 160.0, 160.0, 160.0, 160.0, 160.0, 1280.0, 160.0, 160.0, 160.0, 160.0, 160.0, 320.0, 0.0, 160.0, 160.0]))
--- a/predict/dict_predict.py
+++ b/predict/dict_predict.py
@@ -89,7 +89,7 @@ class DicPredict(BasePredictor):
        temp_pre_model_res = {}
        ab_result = {}
        for k, v in dic_data.items():
-            no_null_v = list(filter(lambda x: x != self.PLACEHOLDER, v))
+            no_null_v = list(filter(lambda x: x != BasePredictor.PLACEHOLDER, v))
            bol = self.v_chain(v)
            if self.is_seq(no_null_v):
                ab_result[k] = '序号'
@@ -111,7 +111,7 @@ class DicPredict(BasePredictor):
                temp_pre_model_res[k] = '类别'

        # 对列元素进行去重并处理掉占位符
-        set_not_null_dic_data = {k: set(list(filter(lambda x: x != self.PLACEHOLDER, dic_data[k]))) for k in
+        set_not_null_dic_data = {k: set(list(filter(lambda x: x != BasePredictor.PLACEHOLDER, dic_data[k]))) for k in
                                 prob_columns}
        for k, v in set_not_null_dic_data.items():
            li_single_pred_res = []
@@ -127,11 +127,11 @@ class DicPredict(BasePredictor):
                continue
            temp_pre_model_res[k] = EN_TO_ZH_MAP[result]

-        # 若有多个参数列或型号列,则进行不同率的比较, 不同率最高的选为目标列
-        not_null_dic_data = {k: list(filter(lambda x: x != self.PLACEHOLDER, dic_data[k])) for k in prob_columns}
-        prob_param_cols = [i for i in temp_pre_model_res if temp_pre_model_res[i] == '参数']
+        # 若有多个型号列,则进行不同率的比较, 不同率最高的选为目标列
+        not_null_dic_data = {k: list(filter(lambda x: x != BasePredictor.PLACEHOLDER, dic_data[k])) for k in prob_columns}
+        # prob_param_cols = [i for i in temp_pre_model_res if temp_pre_model_res[i] == '参数']
        prob_gn_cols = [i for i in temp_pre_model_res if temp_pre_model_res[i] == '型号']
-        for param_or_gn_col_list in [prob_param_cols, prob_gn_cols]:
+        for param_or_gn_col_list in [prob_gn_cols]:
            if len(param_or_gn_col_list) >= 2:
                li_diffrate = [(col, BasePredictor.get_diffrate(not_null_dic_data[col])) for col in
                               param_or_gn_col_list]
@@ -139,6 +139,15 @@ class DicPredict(BasePredictor):
                for col_diffrate in sort_li_diffrate[1:]:
                    temp_pre_model_res.pop(col_diffrate[0])

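+        # If several columns are predicted as parameter columns, compare their feature rates (get_param_featurerate) and keep only the column with the highest rate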
+        prob_param_cols = [i for i in temp_pre_model_res if temp_pre_model_res[i] == '参数']
+        if len(prob_param_cols) >= 2:
+            li_feature_rate = [(col, BasePredictor.get_param_featurerate(not_null_dic_data[col])) for col in
+                               prob_param_cols]
+            sort_li_fearate = sorted(li_feature_rate, key=lambda x: x[1], reverse=True)
+            for col_fearate in sort_li_fearate[1:]:
+                temp_pre_model_res.pop(col_fearate[0])
+
        # 若有多个数量列,则进行空置率的比较, 空置率最低的选为目标列, #TODO 后续可能需要改成数量元素的占比率
        prob_num_cols = [i for i in temp_pre_model_res if temp_pre_model_res[i] == '数量']
        if len(prob_num_cols) >= 2:
@@ -146,7 +155,7 @@ class DicPredict(BasePredictor):
            for prob_num_col in prob_num_cols:
                counter = 0
                for item in dic_data[prob_num_col]:
-                    if str(item).strip() == self.PLACEHOLDER:
+                    if str(item).strip() == BasePredictor.PLACEHOLDER:
                        counter += 1
                li_nullrate.append((prob_num_col, counter / len(dic_data[prob_num_col])))
            sort_li_nullrate = sorted(li_nullrate, key=lambda x: x[1])
@@ -160,7 +169,7 @@ class DicPredict(BasePredictor):
            for prob_brand_col in prob_brand_cols:
                counter = 0
                for item in dic_data[prob_brand_col]:
-                    if str(item).strip() == self.PLACEHOLDER:
+                    if str(item).strip() == BasePredictor.PLACEHOLDER:
                        counter += 1
                li_nullrate.append((prob_brand_col, counter / len(dic_data[prob_brand_col])))
            sort_li_nullrate = sorted(li_nullrate, key=lambda x: x[1])
@@ -197,7 +206,7 @@ class DicPredict(BasePredictor):
            head_std_result = head_id_res.get('std_result')
            model_std_result = model_id_res.get('std_result')
            comprehensive_res = BasePredictor.get_comprehensive_res(head_std_result, model_std_result)
-
+            self.info.info('综合预测结果: ' + str(comprehensive_res))
            if comprehensive_res:
                res = {
                    'std_result': comprehensive_res,
@@ -221,7 +230,7 @@ class DicPredict(BasePredictor):
            # 去掉空置率大于等于0.8的列
            counter = 0
            for item in v:
-                if str(item).strip() == self.PLACEHOLDER:
+                if str(item).strip() == BasePredictor.PLACEHOLDER:
                    counter += 1
            if counter / len(v) <= NAN_RATE:
                new_dic_data[k] = v