Commit 1bed00a1 by 陈森彬

新增询报价聊天识别

parent 5b14b1b3
......@@ -39,7 +39,7 @@
- 启动单个字符串验证的服务: python3 valid_server.py
### 所在目录
测试环境:
IP: 192.168.2.232
IP: 192.168.1.252
PATH: /home/lic/bom_identify
正式环境:
......
......@@ -15,6 +15,23 @@ else:
model_config['modextr_path'] = project_path + r'/' + 'models_and_extractors/'
redis_config = {'host': '127.0.0.1', 'port': 6379, 'decode_responses': True, 'password': 'icDb29mLy2s'}
# MongoDB host address for each deployment environment.
# get_mongo_conf() indexes this mapping with ENVIRONMENT.
MG_HOST_SET = {
'test': '192.168.1.237',
'produce': '172.18.137.23'
}
def get_mongo_conf():
    '''
    Build the MongoDB connection settings for the active environment.

    The host is looked up in MG_HOST_SET by ENVIRONMENT, and the password
    depends on whether ENVIRONMENT equals 'produce'.

    :return: dict with the keys host, port, database, user and password
    '''
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from deployment configuration instead.
    is_produce = ENVIRONMENT == 'produce'
    return {
        "host": MG_HOST_SET[ENVIRONMENT],
        "port": 27017,
        "database": 'ichunt',
        "user": 'ichunt',
        "password": 'huntmon66499' if is_produce else 'huntmon6699',
    }
unit_map = {
'Ω': {'μΩ': 0.000001,
......@@ -124,7 +141,7 @@ encap_list = ["0030", "008004", "01005", "015008", "0201", "02016", "0202", "020
temp_map = ["C0G", "NP0", "COG", "NPO", "X7R", "X5R", "Y5V", "X6S", "X7S", "X7T", "SL", "U2J", "UJ", "X7U", "X8R",
"Z5U", "C0H", "COH", "U2K", "X6T", "X8G", "X8L", "Y5R", "Y5U", "ZLM"]
unit_regex = "μΩ|uΩ|mΩ|Ω|kΩ|KΩ|MΩ|GΩ|TΩ|pF|PF|Pf|pf|nF|NF|Nf|nf|µF|μF|uF|UF|Uf|uf|mF|MF|Mf|mf|F|pH|Ph|PH|ph|nH|µH|UH|μh|uh|Uh|uH|mH|Mh|MH|mh|H|mA|A|a|V|v|kV|Kv|kv|KV|W|w|kW|kw|KW|Kw|%|毫欧|欧姆|欧|千欧|兆欧|伏特|伏|千伏|瓦特|瓦"
unit_regex = "μΩ|uΩ|mΩ|Ω|kΩ|KΩ|MΩ|GΩ|TΩ|f|pF|PF|Pf|pf|nF|NF|Nf|nf|µF|μF|uF|UF|Uf|uf|mF|MF|Mf|mf|F|pH|Ph|PH|ph|nH|µH|UH|μh|uh|Uh|uH|mH|Mh|MH|mh|H|mA|A|a|V|v|kV|Kv|kv|KV|W|w|kW|kw|KW|Kw|%|毫欧|欧姆|欧|千欧|兆欧|伏特|伏|千伏|瓦特|瓦"
r_regex = "Rr"
......@@ -136,8 +153,10 @@ split_char = '|,,/ '
special_tuple = ("-",)
special_str = "~!@#$%^&*()_+-*/<>,.。,[]\/"
special_str = "~!@#$%^&*()_+*/<>,.。,[]\/"
attr_regex = "F|H|K|A|W|KW"
special_attr_regex = "Ω"
replace_char = "|/。,或各和::"
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
from config import unit_map, cast_map, unit_regex, special_tuple, attr_regex, temp_map, special_str, encap_regex, \
r_regex, legal_char, split_char, special_attr_regex
from predict.kw_predict import KwPredict
from config import *
from utils.mongo_opera import MongoOperator
unit_pattern = re.compile('([.a-zA-Z\\-]?)(\d+|\d+[\\\./?]\d+)(' + unit_regex + ')([.a-zA-Z\\-]?)')
attr_pattern = re.compile('(\d+|\.\d+)(' + attr_regex + ')('+ special_attr_regex + ')')
attr_pattern = re.compile('(\d+|\.\d+)(' + attr_regex + ')(' + special_attr_regex + ')')
r_pattern = re.compile(
'([.a-zA-Z\\-]?)(\d+[' + r_regex + ']\d+|[' + r_regex + ']\d+|\d+[' + r_regex + '])([.a-zA-Z\\-]?)')
encap_pattern = re.compile('(.{0,2})(' + encap_regex + ')(.{0,2})', re.I)
# Quantity / price patterns used by the chat-recognition helpers below.
# They are written as raw strings so that regex escapes such as \s, \d and \.
# are not parsed as (invalid) Python string escapes; the re module resolves
# the \uXXXX escapes in the CJK character range itself.

# A number preceded by whitespace or a CJK character and followed by
# whitespace, "个" or "片".
num_pattern = re.compile(r'(\s|[\u4e00-\u9fa5])(\d+\.\d+|\d+)(\s|个|片)')
# A number preceded by whitespace or a CJK character and followed by "K"
# (case-insensitive).
k_num_pattern = re.compile(r'(\s|[\u4e00-\u9fa5])(\d+\.\d+|\d+)(K)', re.I)
# A number preceded by whitespace or a CJK character and followed by "pcs"
# (case-insensitive).
pcs_num_pattern = re.compile(r'(\s|[\u4e00-\u9fa5])(\d+\.\d+|\d+)(pcs)', re.I)
# Any integer or decimal number.
price_num_pattern = re.compile(r'(\d+\.\d+|\d+)')
# One or more consecutive CJK characters in the range U+4E00..U+9FA5.
zh_pattern = re.compile(r'[\u4e00-\u9fa5]+')
# A number immediately followed by "usd" (case-insensitive).
usd_num_pattern = re.compile(r'(\d+\.\d+|\d+)(usd)', re.I)
prefixchar = [i + j for i in split_char for j in legal_char]
tailchar = [i + j for i in legal_char for j in split_char]
prefixchar.extend(list(legal_char))
......@@ -16,6 +23,33 @@ tailchar.extend(list(split_char))
prefixchar.append('')
tailchar.append('')
mongo_op = MongoOperator()
def predict_gn_param(predict: KwPredict, kw) -> int:
    '''
    Decide whether a keyword looks like a part number or a parameter.

    :param predict: predictor instance; its predict(kw, kind) result is read via ['result']
    :param kw: keyword to classify
    :return: 1 when the keyword is accepted, otherwise 0
    '''
    # Keywords of length 0 or 1 are always rejected.
    if len(kw) <= 1:
        return 0
    # A hyphen or a run of digits anywhere in the keyword is accepted
    # without consulting the predictor.
    if re.search(r'(?:.*\-.*|\d+)', kw):
        return 1
    gn_res = predict.predict(kw, 'gn')
    param_res = predict.predict(kw, 'param')
    accepted = gn_res['result'] == 1 or param_res['result'] == 1
    return 1 if accepted else 0
predic = KwPredict('validSingle')
def is_float(s):
xiaoshu_new = str(s)
......@@ -134,7 +168,7 @@ def unit_conversion(unit_res, kw):
if "/" not in kw: # 单独处理除号
if "%" in kw: # 百分号的单独处理
if "-" in unit_str:
unit_str = unit_str.replace("-","")
unit_str = unit_str.replace("-", "")
unit_num = int(kw.replace(unit_str, "").replace("-", ""))
else:
if "." in kw:
......@@ -321,3 +355,93 @@ def check_attr(kw_info):
if res[0][0] and res[0][1] and "." not in res[0][0] and not res[0][2]:
return True
return False
def check_k_num(kw_info):
    '''
    Find a quantity written with a trailing "K" (matched by k_num_pattern).

    :param kw_info: text to scan
    :return: False when nothing matches; otherwise a tuple
             (quantity, matched_text) where quantity is the number multiplied
             by 1000 as a string and matched_text is the digits plus the
             "K" suffix exactly as they appeared
    '''
    matches = k_num_pattern.findall(kw_info)
    if not matches:
        return False
    # Only the first match is used; the leading delimiter group is ignored.
    _, digits, suffix = matches[0]
    if "." in digits:
        quantity = str(delete_extra_zero(float(digits) * 1000))
    else:
        quantity = str(int(digits) * 1000)
    return quantity, digits + suffix
def check_pcs_num(kw_info):
    '''
    Find a quantity written with a "pcs" suffix (matched by pcs_num_pattern).

    :param kw_info: text to scan
    :return: the numeric part of the first match as a string, or False when
             nothing matches
    '''
    matches = pcs_num_pattern.findall(kw_info)
    return matches[0][1].strip() if matches else False
def check_num(kw_info):
    '''
    Find a plain number (matched by num_pattern).

    :param kw_info: text to scan
    :return: the numeric part of the first match as a string, or False when
             nothing matches
    '''
    matches = num_pattern.findall(kw_info)
    return matches[0][1].strip() if matches else False
def check_price(kw_info):
    '''
    Extract a price from the text.

    When the text contains "价" or "税", the first number found by
    price_num_pattern is taken; otherwise only a number followed by "usd"
    (usd_num_pattern) counts.

    :param kw_info: text to scan
    :return: the price as a string, or False when none is found
    '''
    mentions_price = "价" in kw_info or "税" in kw_info
    if mentions_price:
        found = price_num_pattern.findall(kw_info)
        return found[0].strip() if found else False
    found = usd_num_pattern.findall(kw_info)
    return found[0][0].strip() if found else False
def check_zh(kw_info):
    '''
    Report whether the text is free of Chinese characters.

    Note the polarity: despite the name, True means zh_pattern found
    nothing in the text.

    :param kw_info: text to scan
    :return: True when no Chinese characters are present, otherwise False
    '''
    return not zh_pattern.findall(kw_info)
def replace_symbol(kw):
    '''
    Turn every character listed in replace_char into a space so that the
    text uses one uniform separator.

    :param kw: text to normalize
    :return: the text with each listed character replaced by a space
    '''
    # One translate pass maps each separator character to a single space.
    to_space = str.maketrans(dict.fromkeys(replace_char, " "))
    return kw.translate(to_space)
......@@ -27,6 +27,7 @@ class StatusCode:
"100003": ("no qty", "未检测到数量"),
"100004": ("no param or gn", "未检测到参数/型号列"),
"100005": ("not yet included", "中文映射还未收录"),
"100006": ("not origin", "没有请求来源"),
}
......
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import json
from urllib.parse import unquote
import tornado.web
import tornado.ioloop
from utils.functions import pre_judge, word_conversion, cut_params, check_param, unit_conversion, get_not_exist_list, \
check_encap, check_temp, check_symbol, check_param_r, check_param_k, check_attr
from utils.functions import *
from utils.redis_cli import redis_cli
from predict.kw_predict import KwPredict
from utils.status import code2msg
import re
def predict_gn_param(predict: KwPredict, kw) -> int:
    '''
    Tell whether a keyword should be treated as a part number or parameter.

    :param predict: predictor instance; its predict(kw, kind) result is read via ['result']
    :param kw: keyword to classify
    :return: 1 when the keyword is accepted, otherwise 0
    '''
    # Too short to be meaningful.
    if len(kw) <= 1:
        return 0
    # Hyphenated keywords and keywords containing digits are accepted
    # directly, before the predictor is consulted.
    if re.search(r'(?:.*\-.*|\d+)', kw):
        return 1
    gn_res = predict.predict(kw, 'gn')
    param_res = predict.predict(kw, 'param')
    if gn_res['result'] == 1 or param_res['result'] == 1:
        return 1
    return 0
class KwHandler(tornado.web.RequestHandler):
pat = re.compile(r'(?:.*\-.*|\d+)')
......@@ -152,11 +125,147 @@ class UCHandler(tornado.web.RequestHandler):
words2_list.append(kw_info)
res['status'] = 1
res['words'] = list(set(words_list)) #bom使用
res['words'] = list(set(words_list)) # bom使用
res['attrs'] = list(set(attrs_list))
res['encap'] = list(set(encap_list))
res['words2'] = list(set(words2_list)) #words2 前台搜索使用
print(res)
res['words2'] = list(set(words2_list)) # words2 前台搜索使用
# print(res)
self.write(res)
# Tornado handler for inquiry/quotation chat recognition.
# POST body: a JSON list of text lines. For each line the handler tries to
# pick out part numbers ("gn"), a brand, a quantity ("number"), a price and
# a package ("encap"), and responds with {"data": [one dict per line]}.
# NOTE(review): indentation was lost in this view, so the exact nesting of the
# statements below must be confirmed against the real file.
class ImHandler(tornado.web.RequestHandler):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.predic = KwPredict('validSingle')
self.redis_cli = redis_cli
# NOTE(review): compiled from an empty pattern and not referenced anywhere else in this class.
self.num_pattern = re.compile('') # unit matching
async def post(self):
# The request body is expected to be a JSON-encoded list of strings.
data_list = json.loads(self.request.body)
# An empty/falsy payload and a non-list payload each map to a status message via code2msg.
if not data_list:
code = '100002'
res = code2msg(code)
elif type(data_list) != list:
code = '100001'
res = code2msg(code)
else:
# Brand names come from the 'brand_set' key in redis (smembers) — presumably a set of known brand strings; verify.
brand_set = redis_cli.smembers('brand_set')
data = []
for data_info in data_list:
# Result skeleton for one input line.
data_obj = {}
data_obj["gn"] = []
data_obj["brand"] = ""
data_obj["number"] = ""
data_obj["price"] = ""
data_obj["encap"] = ""
new_data_info = []
data_info = replace_symbol(data_info).replace("(", "(").replace(")", ")") # normalize separators, replace Chinese parentheses
data_cut_list = data_info.split(" ")
# new_cut_info is the space-joined text of the non-empty tokens; recognized pieces are removed from it later.
new_cut_info = ""
for cut_info in data_cut_list: # first-pass filter: keep non-blank tokens
if cut_info.strip():
new_data_info.append(cut_info)
new_cut_info += cut_info + " "
# Classify each token as brand, package or part number.
for new_info in new_data_info:
if new_info in brand_set:
data_obj["brand"] = new_info
else:
resp1 = self.predic.predict(new_info, 'all')
resp2 = self.predic.predict(new_info, 'brand')
# check_zh returns True when the token contains no Chinese characters.
zh_res = check_zh(new_info)
if resp1['result'] == '品牌' and resp2['result'] and "HK" not in new_info and zh_res: # a predicted brand is accepted only if the token has no Chinese characters and does not contain "HK"; a token with Chinese that is not in the brand set is not accepted
data_obj["brand"] = new_info
# elif resp1['result'] != '品牌' and resp2['result']:
#
# elif resp1['result'] == '品牌' and not resp2['result']:
else:
encap_res = check_encap(new_info) # check whether the token is a package; only standard packages are matched
if encap_res:
data_obj["encap"] = new_info
else:
gn_res = self.predic.predict(new_info, 'gn')
# The quantity patterns require a leading whitespace or CJK character, hence the " " prefix.
k_res = check_k_num(" " + new_info)
pcs_res = check_pcs_num(" " + new_info)
if gn_res[
"result"] and not k_res and not new_info.strip().isdigit() and not pcs_res and check_zh(
new_info): # exclude pure numbers and values ending in K / pcs; a part number must not contain Chinese
data_obj["gn"].append(new_info)
# Remove the recognized brand / package / part numbers from the remaining text.
if data_obj["brand"]:
new_cut_info = new_cut_info.replace(data_obj["brand"], "", 1)
if data_obj["encap"]:
new_cut_info = new_cut_info.replace(data_obj["encap"], "")
if data_obj["gn"]:
for gn in data_obj["gn"]:
new_cut_info = new_cut_info.replace(gn, "")
new_cut_list = new_cut_info.split(" ")
for list_info in new_cut_list: # extract quantity and price
if list_info.strip():
kw_word = " " + list_info
# Quantity with a K suffix: k_res[0] is the expanded number, k_res[1] the matched text.
k_res = check_k_num(kw_word)
if k_res:
data_obj["number"] = k_res[0]
kw_word = kw_word.replace(k_res[1], "")
# Quantity with a pcs suffix; only used when no quantity has been set yet.
pcs_res = check_pcs_num(kw_word)
if pcs_res and not data_obj["number"]:
data_obj["number"] = pcs_res
kw_word = kw_word.replace(pcs_res, "")
# Plain number; only used when no quantity has been set yet.
num_res = check_num(kw_word + " ")
if num_res and not data_obj["number"]:
data_obj["number"] = num_res
kw_word = kw_word.replace(num_res, "")
price_res = check_price(kw_word + " ")
if price_res:
data_obj["price"] = price_res
# Any hit consumes the whole token from the remaining text.
if k_res or pcs_res or num_res or price_res:
new_cut_info = new_cut_info.replace(list_info, "")
# First approach:
# kw_word = " " + list_info
# k_res = check_k_num(kw_word)
#
# if k_res:
# data_obj["number"] = k_res
# new_cut_info = new_cut_info.replace(list_info, "")
# else:
#
# pcs_res = check_pcs_num(kw_word)
# if pcs_res:
# data_obj["number"] = pcs_res
# new_cut_info = new_cut_info.replace(list_info, "")
# else:
# kw_word = kw_word + " "
# num_res = check_num(kw_word)
# print(num_res,kw_word)
# if num_res and list_info != new_data_info[0]:
# data_obj["number"] = num_res
# new_cut_info = new_cut_info.replace(list_info, "")
# else:
# price_res = check_price(kw_word)
# if price_res and not data_obj["price"]:
# data_obj["price"] = price_res
# new_cut_info = new_cut_info.replace(list_info, "")
# Fallback: look at whatever text is still unclassified.
if new_cut_info.strip():
end_cut_list = new_cut_info.split(" ")
len_lsit = []
for end_cut_info in end_cut_list: # among the leftover unknown words, keep only those without Chinese and longer than 6 characters
if end_cut_info.strip() and check_zh(end_cut_info.strip()) and len(end_cut_info.strip()) > 6:
len_lsit.append(end_cut_info)
if len_lsit and len(len_lsit) == 1 and not data_obj["gn"]: # if no part number has been found yet and exactly one candidate remains, it is most likely the part number; treat it as one for now
data_obj["gn"].append(len_lsit[0].strip())
if data_obj:
data.append(data_obj)
# print(data_obj)
res = {}
res["data"] = data
# print(res)
self.write(res)
......@@ -166,6 +275,7 @@ def gen_app():
register_tornado_handlers = {'/keyword': KwHandler,
'/unit_conversion': UCHandler,
'/identify_meaning': ImHandler,
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment