Commit 76ff0b71 by lzzzzl

Add duplicate-column detection

parents 6c386050 273f2f67
from concurrent import futures
-import time
+import time, json
import grpc
from protobuf import classify_pb2
from protobuf import classify_pb2_grpc
from utils.config import model_config
-from utils.predic_fac import PredictorFac
+from predic_fac import PredictorFac
from utils.log_manager import get_logger
from utils.excel_manager import read_from_excel
log_server = get_logger('server')
@@ -17,13 +16,14 @@ class Classify(classify_pb2_grpc.classifyServicer):
log_server.info('classify server start!')
self.predictorfac = PredictorFac(model_config)
def Classify(self, request, context):
-log_server.debug('Received data: ' + request.keyword)
-res = self.predictorfac.predic(request.keyword)
+print('Received data: ' + request.keyword)
+res = self.predictorfac.predict(request.keyword)
return classify_pb2.ClassifyReply(message='result {msg}'.format(msg=res))
def fac_test_predic(self, data):
-res = self.predictorfac.predic(data)
+res = self.predictorfac.predict(data)
return res
@@ -41,7 +41,8 @@ def serve():
if __name__ == '__main__':
-# serve()
-data = read_from_excel('DZ0901_V1.4_BOM.xlsx', 'DZ0901_V1.3BOM清单')
-print(Classify().fac_test_predic(data))
+serve()
+# data = read_from_excel('DZ0901_V1.4_BOM.xlsx', 'DZ0901_V1.3BOM清单')
+# print(data)
+# print(Classify().fac_test_predic(data))
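# The body of serve() is collapsed in the hunk above. A minimal sketch of a typical
# bootstrap for this servicer, assuming the generated add_classifyServicer_to_server()
# helper and the port 50051 that the client connects to (not the collapsed original):
#     def serve():
#         server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
#         classify_pb2_grpc.add_classifyServicer_to_server(Classify(), server)
#         server.add_insecure_port('[::]:50051')
#         server.start()
#         try:
#             while True:
#                 time.sleep(60 * 60 * 24)
#         except KeyboardInterrupt:
#             server.stop(0)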
import grpc
from protobuf import classify_pb2
from protobuf import classify_pb2_grpc
import pandas as pd
import json
def get_test_data():
df=pd.read_excel(r'C:\Users\ICHUNT\Desktop\bomlist\DZ0901_V1.4_BOM.xlsx',header=None)
df.fillna(' ', inplace=True)
dic_dft = df.to_dict(orient='list')
return json.dumps(dic_dft)
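# For reference: with header=None, to_dict(orient='list') keys the frame by column
# index, and json.dumps() turns those integer keys into strings ("0", "1", ...).
# Illustrative shape (values made up, not taken from the real workbook):
#     pd.DataFrame([['序号', '名称'], [1, '电阻']]).to_dict(orient='list')
#     -> {0: ['序号', 1], 1: ['名称', '电阻']}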
def run():
# Connect to the RPC server
channel = grpc.insecure_channel('localhost:50051')
# Call the RPC service
stub = classify_pb2_grpc.classifyStub(channel)
-response = stub.Classify(classify_pb2.ClassifyRequest(keyword='czl'))
+test = get_test_data()
+response = stub.Classify(classify_pb2.ClassifyRequest(keyword=test))
print("Classify client received: " + response.message)
if __name__ == '__main__':
......
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from sklearn.externals import joblib
import pickle, json
from predict import dict_predict, kw_predict
class PredictorFac():
def __init__(self, config):
with open(config['extractor_path'], 'rb') as f:
extractor = pickle.load(f)
classifier = joblib.load(config['model_path'])
self.kw_predictor = kw_predict.KwPredict('single', extractor, classifier)
self.dict_predictor = dict_predict.DicPredict('dict', extractor, classifier)
def _get_predictor(self, data):
if isinstance(data, str):
return self.kw_predictor
elif isinstance(data, dict):
return self.dict_predictor
def predict(self, data):
dic_data = json.loads(data)
predictor = self._get_predictor(dic_data)
res = predictor.predict(dic_data)
return res
if __name__ == "__main__":
def get_test_data():
import pandas as pd
import json
df = pd.read_excel(r'C:\Users\ICHUNT\Desktop\bomlist\DZ0901_V1.4_BOM.xlsx', header=None)
df.fillna(' ', inplace=True)
dic_dft = df.to_dict(orient='list')
return json.dumps(dic_dft)
from utils.config import model_config
data = get_test_data()
p = PredictorFac(model_config)
res = p.predict(data)
print(res)
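# Dispatch sketch: after json.loads(), a JSON object selects the dict predictor and a
# bare JSON string selects the keyword predictor (example values are illustrative,
# using the same PredictorFac instance p as above):
#     p.predict(json.dumps({'0': ['序号', 1, 2]}))   # dict -> dict_predictor
#     p.predict(json.dumps('0805 10K resistor'))      # str  -> kw_predictor
# Note the input must be JSON-encoded; a plain string such as 'czl' would make
# json.loads() raise a ValueError.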
#!/usr/bin/env python
# -*- coding:utf-8 -*-
-__all__ = ['kw_predict', 'list_predict']
\ No newline at end of file
+__all__ = ['kw_predict', 'dict_predict']
\ No newline at end of file
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from predict.base_handler import BasePredictor
# Possible header field names (kept in Chinese so they match the incoming BOM headers)
prob_fields = ["序号", "名称", "规格", "MPN", "用量(pcs)", "用量", "pcs", "位号", "描述", "值", "数量", "封装", "类别", "a面位置", "b面位置", "备注",
"需求数量",
"参考品牌", "品牌", "item", "厂商编码", "品牌/厂商", "参考料号", "参考供应商", "top面", "bottom面"]
# Mapping from standard field names to their known aliases
fields_map = {"序号": ["序号"],
"类别": ["类别", "分类", "名称", "类别名称"],
"参数": ["参数", "规格", "描述"],
"型号": ["型号", "参考料号", "料号", "MPN"],
"数量": ["数量", "用量(pcs)", "PCS", "用量", "用量(PCS)", "pcs"],
"封装": ["封装", "封装规格"],
"品牌": ["品牌", "品牌/厂商", "参考品牌", "厂商编码", "参考供应商", "厂商", "参考供应商", "参考厂商"]}
# Header names that mark an ordinal (serial-number) column
order_list = ['序号']
def fun(seri):
# Return (header cell value, column name) for the first cell that matches a known header field.
li_seri = seri.tolist()
for field in li_seri:
if str(field).strip().lower() in [p.lower() for p in prob_fields]:
return field, seri.name
# How many leading rows to inspect for header detection
HEAD_ROW = 5
class DicPredict(BasePredictor):
def id_by_field(self, df_head):
"""
:param df_head: 传入接收数据的头部Dataframe(默认5行)
:return li_res: 返回结果列表
"""
series = df_head.apply(fun)
series.dropna(inplace=True)
li_fie = series.tolist()
li_res = []
for field, column_name in li_fie:
dic = {
'std_name': '',
'pronoun': field,
'column_name': column_name
}
for k, v in fields_map.items():
if str(field).lower() in [a.lower() for a in v]:
dic['std_name'] = k
li_res.append(dic)
return li_res
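# Illustrative return value of id_by_field() for a header row like 序号 / 规格 / item
# (column names are assumed here to be the stringified column indexes):
#     [{'std_name': '序号', 'pronoun': '序号', 'column_name': '0'},
#      {'std_name': '参数', 'pronoun': '规格', 'column_name': '1'},
#      {'std_name': '',    'pronoun': 'item', 'column_name': '2'}]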
def pre_predict(self, dict_data):
columns = []
li_data = []
for k, v in dict_data.items():
# Skip columns where 80% or more of the cells are empty
counter = 0
for item in v:
if not str(item).strip():
counter += 1
if counter / len(v) >= 0.8:
continue
columns.append(k)
li_data.append(v)
df = self.pd.DataFrame(li_data)
dft = df.T.head(HEAD_ROW)
dft.columns = columns
li_res_raw = self.id_by_field(dft)
std_result = []
ab_result = []
for i in li_res_raw:
if i.get('std_name'):
dic_has_res = {i['column_name']: i['std_name']}
std_result.append(dic_has_res)
else:
dic_ab_res = {i['column_name']: i['pronoun']}
ab_result.append(dic_ab_res)
id_res = {
'std_result': std_result,
'ab_result': ab_result,
}
return id_res
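# Shape of id_res for the same illustrative header row (keys are the column names):
#     {'std_result': [{'0': '序号'}, {'1': '参数'}],
#      'ab_result':  [{'2': 'item'}]}
# std_result holds columns mapped to a standard name; ab_result holds recognized
# headers that have no standard mapping yet.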
def predict(self, dic_data):
res = self.pre_predict(dic_data)
if res:
return res
# Fall back to ordinal detection on the first column; after JSON decoding the
# dict keys are strings, so index by position rather than by the literal key 0.
if len(dic_data) > 0:
return self.order_predict(list(dic_data.values())[0])
def order_predict(self, data):
collect_num = [int(kw) for kw in data if isinstance(kw, float) or isinstance(kw, int)]
judge = self.IsIncrease(collect_num, len(collect_num))
print('judge: ' + str(judge))
return judge
"""
判断列表元素是否递增
"""
def IsIncrease(self, arr, size):
if size == 1:
return True
return (arr[size - 1] >= arr[size - 2]) and self.IsIncrease(arr, size - 1)
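# The recursive check above makes one call per element, so a column longer than
# Python's default recursion limit (about 1000 rows) would raise RecursionError.
# An equivalent iterative version (an alternative sketch, not part of this commit):
def is_increase_iter(arr):
    return all(arr[i] >= arr[i - 1] for i in range(1, len(arr)))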
@@ -5,6 +5,7 @@ from predict.base_handler import BasePredictor
RIGHT_LEVEL = 0.7
+REPEAT_TIMES = 3
class LiPredict(BasePredictor):
@@ -50,5 +51,22 @@ class LiPredict(BasePredictor):
else:
return False
def isRepeat(self, data):
repeat_dict = {}
for kw in data:
if repeat_dict.get(kw):
repeat_dict[kw] += 1
else:
repeat_dict[kw] = 1
print(repeat_dict)
return repeat_dict
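# isRepeat() only counts occurrences so far. A sketch of how the counts could feed a
# duplicate-column verdict, assuming REPEAT_TIMES (defined above) is the intended
# threshold; the method name has_repeats is hypothetical:
#     def has_repeats(self, data):
#         counts = self.isRepeat(data)
#         return any(c >= REPEAT_TIMES for c in counts.values())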
#
# """
# Whether a column consists of repeated values
# """
# def vailed(self, data):
@@ -23,7 +23,6 @@ def read_from_excel(file_name, sheet_name):
for index in data.columns:
result_dict[count] = data[index].tolist()
count += 1
-print(result_dict)
return result_dict
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from sklearn.externals import joblib
import pickle
from predict import *
class PredictorFac():
def __init__(self, config):
with open(config['extractor_path'], 'rb') as f:
extractor = pickle.load(f)
classifier = joblib.load(config['model_path'])
self.kw_predictor = kw_predict.KwPredict('single', extractor, classifier)
self.list_predictor = list_predict.LiPredict('list', extractor, classifier)
def _get_predictor(self, data):
if isinstance(data, str):
return self.kw_predictor
elif isinstance(data, dict):
return self.list_predictor
def predic(self, data):
predictor = self._get_predictor(data)
res = predictor.predict(data)
return res