Commit b7f946c4 by lichenggang

多个参数列的选择从比较不同率改为比较特征率

parent a9c6c7dd
Showing with 43 additions and 20 deletions
......@@ -61,7 +61,7 @@ class BasePredictor:
# return not self.is_catecol(data)
@classmethod
def is_num(self, data):
def is_num(cls, data):
"""
数量列预测
"""
......@@ -87,7 +87,7 @@ class BasePredictor:
return True if rate >= RIGHT_LEVEL else False
@classmethod
def is_catecol(self, data):
def is_catecol(cls, data):
cates = []
for i in data:
for j in CATEGORY:
......@@ -98,14 +98,14 @@ class BasePredictor:
return rate >= CATE_LEVEL
@classmethod
def is_multi_same(self, data):
no_null_data = list(filter(lambda x: x != self.PLACEHOLDER, data))
def is_multi_same(cls, data):
    """
    Tell whether one value repeats often enough in ``data``.

    Entries equal to ``cls.PLACEHOLDER`` are ignored. The occurrence count
    of the most frequent remaining value is compared with the module-level
    ``MULTI_SAME_LEVEL`` threshold.

    :param data: iterable of hashable cell values.
    :return: ``True`` when the highest occurrence count is at least
        ``MULTI_SAME_LEVEL``; when no non-placeholder entry exists the
        highest count is taken as 0.
    """
    no_null_data = [x for x in data if x != cls.PLACEHOLDER]
    # ``default=0`` covers an empty / all-placeholder column, which used to
    # raise IndexError on ``li_sort[0]``.
    top_count = max(Counter(no_null_data).values(), default=0)
    return top_count >= MULTI_SAME_LEVEL
@classmethod
def is_seq(self, data):
def is_seq(cls, data):
"""
序号列预测
"""
......@@ -126,7 +126,7 @@ class BasePredictor:
return True if rate >= SEQ_LEVEL else False
@classmethod
def get_diffrate(self, data):
def get_diffrate(cls, data):
"""
得到不同率
"""
......@@ -134,14 +134,14 @@ class BasePredictor:
return rate
@classmethod
def repeat_max(self, li):
def repeat_max(cls, li):
    """
    Return the element that occurs most often in ``li``.

    Ties are resolved in favour of the element that appears first in
    ``li``.

    :param li: iterable of hashable items,
        e.g. ``['brand_name', 'goods_name', 'brand_name']``.
    :return: the most frequent item.
    :raises IndexError: if ``li`` is empty.
    """
    # most_common(1) yields e.g. [('brand_name', 4)] without sorting every
    # (item, count) pair.
    return Counter(li).most_common(1)[0][0]
@classmethod
def get_comprehensive_res(self, head_std_result, model_std_result):
def get_comprehensive_res(cls, head_std_result, model_std_result):
vote_count = {
"类别": [],
......@@ -165,7 +165,7 @@ class BasePredictor:
return comprehensive_res
@classmethod
def is_ref(self, data):
def is_ref(cls, data):
"""
位号列预测
"""
......@@ -177,7 +177,7 @@ class BasePredictor:
return round(count / len(data), 3) >= REF_LEVEL or False
@classmethod
def is_pcs(self, data):
def is_pcs(cls, data):
"""
pcs列
"""
......@@ -188,5 +188,19 @@ class BasePredictor:
count += 1
return round(count / len(data), 3) >= PCS_LEVEL or False
@classmethod
def get_param_featurerate(cls, data):
    """
    Return the feature rate of a candidate parameter column.

    The feature rate is the fraction of entries in ``data`` whose string
    form contains at least one of the markers ``'pf'``, ``'%'``, ``'±'``
    or ``'uf'``, rounded to 3 decimal places. Matching is case-sensitive.

    :param data: sized collection of cell values; each value is converted
        with ``str`` before matching.
    :return: float between 0 and 1; ``0.0`` when ``data`` is empty.
    """
    features = ('pf', '%', '±', 'uf')
    total = len(data)
    # An empty column carries no parameter features; returning 0.0 avoids
    # the ZeroDivisionError that ``count / len(data)`` would raise.
    if total == 0:
        return 0.0
    # The markers contain no whitespace, so stripping the text first would
    # not change the result of the substring test.
    count = sum(
        1 for item in data
        if any(feature in str(item) for feature in features)
    )
    return round(count / total, 3)
if __name__ == "__main__":
print(BasePredictor.is_num([3400.0, 5920.0, 4849.0, 2544.0, 3270.0, 52751.0, 2031.0, 5302.0, 726.0, 1247.0, 2472.0, 689.0, 6049.0, 26796.0, 6164.0, 1605.0, 4346.0, 640.0, 960.0, 960.0, 320.0, 160.0, 860.0, 160.0, 320.0, 3183.0, 10151.0, 640.0, 130.0, 1237.0, 800.0, 960.0, 3740.0, 17701.0, 2146.0, 1280.0, 160.0, 1120.0, 160.0, 480.0, 960.0, 480.0, 160.0, 4717.0, 160.0, 160.0, 160.0, 640.0, 160.0, 320.0, 160.0, 160.0, 800.0, 800.0, 480.0, 1600.0, 155.0, 960.0, 320.0, 944.0, 160.0, 160.0, 1280.0, 1852.0, 7680.0, 7680.0, 2880.0, 160.0, 224.0, 480.0, 480.0, 640.0, 160.0, 640.0, 320.0, 1760.0, 640.0, 480.0, 960.0, 160.0, 160.0, 160.0, 160.0, 1920.0, 160.0, 5600.0, 480.0, 2560.0, 160.0, 160.0, 160.0, 160.0, 160.0, 1280.0, 160.0, 160.0, 160.0, 160.0, 160.0, 320.0, 0.0, 160.0, 160.0]))
......@@ -89,7 +89,7 @@ class DicPredict(BasePredictor):
temp_pre_model_res = {}
ab_result = {}
for k, v in dic_data.items():
no_null_v = list(filter(lambda x: x != self.PLACEHOLDER, v))
no_null_v = list(filter(lambda x: x != BasePredictor.PLACEHOLDER, v))
bol = self.v_chain(v)
if self.is_seq(no_null_v):
ab_result[k] = '序号'
......@@ -111,7 +111,7 @@ class DicPredict(BasePredictor):
temp_pre_model_res[k] = '类别'
# 对列元素进行去重并处理掉占位符
set_not_null_dic_data = {k: set(list(filter(lambda x: x != self.PLACEHOLDER, dic_data[k]))) for k in
set_not_null_dic_data = {k: set(list(filter(lambda x: x != BasePredictor.PLACEHOLDER, dic_data[k]))) for k in
prob_columns}
for k, v in set_not_null_dic_data.items():
li_single_pred_res = []
......@@ -127,11 +127,11 @@ class DicPredict(BasePredictor):
continue
temp_pre_model_res[k] = EN_TO_ZH_MAP[result]
# 若有多个参数列或型号列,则进行不同率的比较, 不同率最高的选为目标列
not_null_dic_data = {k: list(filter(lambda x: x != self.PLACEHOLDER, dic_data[k])) for k in prob_columns}
prob_param_cols = [i for i in temp_pre_model_res if temp_pre_model_res[i] == '参数']
# 若有多个型号列,则进行不同率的比较, 不同率最高的选为目标列
not_null_dic_data = {k: list(filter(lambda x: x != BasePredictor.PLACEHOLDER, dic_data[k])) for k in prob_columns}
# prob_param_cols = [i for i in temp_pre_model_res if temp_pre_model_res[i] == '参数']
prob_gn_cols = [i for i in temp_pre_model_res if temp_pre_model_res[i] == '型号']
for param_or_gn_col_list in [prob_param_cols, prob_gn_cols]:
for param_or_gn_col_list in [prob_gn_cols]:
if len(param_or_gn_col_list) >= 2:
li_diffrate = [(col, BasePredictor.get_diffrate(not_null_dic_data[col])) for col in
param_or_gn_col_list]
......@@ -139,6 +139,15 @@ class DicPredict(BasePredictor):
for col_diffrate in sort_li_diffrate[1:]:
temp_pre_model_res.pop(col_diffrate[0])
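# If several columns were predicted as '参数' (parameter), compare their parameter feature rates and keep only the column with the highest rate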
prob_param_cols = [i for i in temp_pre_model_res if temp_pre_model_res[i] == '参数']
if len(prob_param_cols) >= 2:
li_feature_rate = [(col, BasePredictor.get_param_featurerate(not_null_dic_data[col])) for col in
prob_param_cols]
sort_li_fearate = sorted(li_feature_rate, key=lambda x: x[1], reverse=True)
for col_fearate in sort_li_fearate[1:]:
temp_pre_model_res.pop(col_fearate[0])
# 若有多个数量列,则进行空置率的比较, 空置率最低的选为目标列, #TODO 后续可能需要改成数量元素的占比率
prob_num_cols = [i for i in temp_pre_model_res if temp_pre_model_res[i] == '数量']
if len(prob_num_cols) >= 2:
......@@ -146,7 +155,7 @@ class DicPredict(BasePredictor):
for prob_num_col in prob_num_cols:
counter = 0
for item in dic_data[prob_num_col]:
if str(item).strip() == self.PLACEHOLDER:
if str(item).strip() == BasePredictor.PLACEHOLDER:
counter += 1
li_nullrate.append((prob_num_col, counter / len(dic_data[prob_num_col])))
sort_li_nullrate = sorted(li_nullrate, key=lambda x: x[1])
......@@ -160,7 +169,7 @@ class DicPredict(BasePredictor):
for prob_brand_col in prob_brand_cols:
counter = 0
for item in dic_data[prob_brand_col]:
if str(item).strip() == self.PLACEHOLDER:
if str(item).strip() == BasePredictor.PLACEHOLDER:
counter += 1
li_nullrate.append((prob_brand_col, counter / len(dic_data[prob_brand_col])))
sort_li_nullrate = sorted(li_nullrate, key=lambda x: x[1])
......@@ -197,7 +206,7 @@ class DicPredict(BasePredictor):
head_std_result = head_id_res.get('std_result')
model_std_result = model_id_res.get('std_result')
comprehensive_res = BasePredictor.get_comprehensive_res(head_std_result, model_std_result)
self.info.info('综合预测结果: ' + str(comprehensive_res))
if comprehensive_res:
res = {
'std_result': comprehensive_res,
......@@ -221,7 +230,7 @@ class DicPredict(BasePredictor):
# 去掉空置率大于等于0.8的列
counter = 0
for item in v:
if str(item).strip() == self.PLACEHOLDER:
if str(item).strip() == BasePredictor.PLACEHOLDER:
counter += 1
if counter / len(v) <= NAN_RATE:
new_dic_data[k] = v
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment