Commit de3ff5d8 by 陈森彬

新增询报价聊天识别

parent 32ae2048
Showing with 50 additions and 14 deletions
...@@ -160,3 +160,5 @@ attr_regex = "F|H|K|A|W|KW" ...@@ -160,3 +160,5 @@ attr_regex = "F|H|K|A|W|KW"
special_attr_regex = "Ω" special_attr_regex = "Ω"
replace_char = "|/。,或各和::" replace_char = "|/。,或各和::"
# Substrings scanned by check_interference(): a token containing any of them
# makes that check return False, which keeps the token from being stored as a brand.
interference_tupe = ("HK","week",)
...@@ -16,6 +16,8 @@ pcs_num_pattern = re.compile('(\s|[\u4e00-\u9fa5])(\d+\.\d+|\d+)(pcs)', re.I) ...@@ -16,6 +16,8 @@ pcs_num_pattern = re.compile('(\s|[\u4e00-\u9fa5])(\d+\.\d+|\d+)(pcs)', re.I)
price_num_pattern = re.compile('(\d+\.\d+|\d+)') price_num_pattern = re.compile('(\d+\.\d+|\d+)')
zh_pattern = re.compile('[\u4e00-\u9fa5]+') zh_pattern = re.compile('[\u4e00-\u9fa5]+')
usd_num_pattern = re.compile('(\d+\.\d+|\d+)(usd)', re.I) usd_num_pattern = re.compile('(\d+\.\d+|\d+)(usd)', re.I)
# Delivery-time matcher. Three groups:
#   1. one leading whitespace or CJK character,
#   2. the quantity: a numeric range (3-5, 3~5, 3到5, 3至5), a plain integer,
#      or a single Chinese numeral,
#   3. the unit (日/周/月/天/工作日/week/weeks/day).
# A raw string is used so that \s and \d are not invalid string escapes; the re
# module resolves the \uXXXX escapes itself. re.I makes the Latin units match
# in any case, consistent with pcs_num_pattern and usd_num_pattern.
time_pattern = re.compile(
    r'(\s|[\u4e00-\u9fa5])(\d+[-~到至]\d+|\d+|一|二|三|四|五|六|七|八|九|十|两|叁|贰)(日|周|月|weeks|week|天|DAY|工作日)',
    re.I)
prefixchar = [i + j for i in split_char for j in legal_char] prefixchar = [i + j for i in split_char for j in legal_char]
tailchar = [i + j for i in legal_char for j in split_char] tailchar = [i + j for i in legal_char for j in split_char]
prefixchar.extend(list(legal_char)) prefixchar.extend(list(legal_char))
...@@ -373,7 +375,7 @@ def check_k_num(kw_info): ...@@ -373,7 +375,7 @@ def check_k_num(kw_info):
res = str(delete_extra_zero(float(res[0][1]) * 1000)) res = str(delete_extra_zero(float(res[0][1]) * 1000))
else: else:
res = str(int(res[0][1]) * 1000) res = str(int(res[0][1]) * 1000)
return res,res_num return res, res_num
# return res # return res
...@@ -396,11 +398,14 @@ def check_num(kw_info): ...@@ -396,11 +398,14 @@ def check_num(kw_info):
:param kw_info: :param kw_info:
:return: :return:
''' '''
res = num_pattern.findall(kw_info)
res = num_pattern.findall(kw_info.replace(",", ""))
if not res: if not res:
return False return False
else: else:
return res[0][1].strip() if res[0][1][0] != "0":
return res[0][1].strip()
return False
def check_price(kw_info): def check_price(kw_info):
...@@ -445,3 +450,30 @@ def replace_symbol(kw): ...@@ -445,3 +450,30 @@ def replace_symbol(kw):
for replace_info in replace_char: for replace_info in replace_char:
kw = kw.replace(replace_info, " ") kw = kw.replace(replace_info, " ")
return kw return kw
def check_time(kw_info):
    '''
    Extract a delivery lead time from the given text.

    :param kw_info: text scanned with ``time_pattern``
    :return: quantity followed by unit for the first match, with the range
        separators "到"/"至" rewritten to "-" and the unit "工作日" rewritten
        to "天"; ``False`` when the pattern does not match
    '''
    first_hit = time_pattern.search(kw_info)
    if first_hit is None:
        return False
    # Group 2 is the quantity, group 3 the unit; group 1 (leading char) is dropped.
    quantity = first_hit.group(2).replace("到", "-").replace("至", "-")
    unit = first_hit.group(3).replace("工作日", "天")
    return quantity + unit
def check_interference(kw_info, interference_words=None):
    '''
    Report whether the text is free of interference words.

    :param kw_info: text to inspect
    :param interference_words: iterable of substrings to look for; when
        omitted, the module-level ``interference_tupe`` is used
    :return: ``False`` if any interference word occurs in ``kw_info``,
        ``True`` otherwise
    '''
    if interference_words is None:
        interference_words = interference_tupe
    # The per-word debug print was removed: it wrote to stdout on every call.
    return not any(word in kw_info for word in interference_words)
...@@ -159,9 +159,11 @@ class ImHandler(tornado.web.RequestHandler): ...@@ -159,9 +159,11 @@ class ImHandler(tornado.web.RequestHandler):
data_obj["number"] = "" data_obj["number"] = ""
data_obj["price"] = "" data_obj["price"] = ""
data_obj["encap"] = "" data_obj["encap"] = ""
data_obj["delivery_time"] = ""
new_data_info = [] new_data_info = []
data_info = replace_symbol(data_info).replace("(", "(").replace(")", ")") # 统一分隔符,替换中文括号 # print(data_info.decode("gbk").encode("utf-8"))
data_info = replace_symbol(data_info).replace("(", "(").replace(")", ")").replace("\xa0","").replace("\t"," ") # 统一分隔符,替换中文括号
data_cut_list = data_info.split(" ") data_cut_list = data_info.split(" ")
new_cut_info = "" new_cut_info = ""
for cut_info in data_cut_list: # 第一遍过滤 for cut_info in data_cut_list: # 第一遍过滤
...@@ -174,8 +176,10 @@ class ImHandler(tornado.web.RequestHandler): ...@@ -174,8 +176,10 @@ class ImHandler(tornado.web.RequestHandler):
else: else:
resp1 = self.predic.predict(new_info, 'all') resp1 = self.predic.predict(new_info, 'all')
resp2 = self.predic.predict(new_info, 'brand') resp2 = self.predic.predict(new_info, 'brand')
zh_res = check_zh(new_info) # zh_res = check_zh(new_info)
if resp1['result'] == '品牌' and resp2['result'] and "HK" not in new_info and zh_res: #判断是否含有中文,若含有中文,且品牌集合里面没有录入,则不当型号 if resp1['result'] == '品牌' and resp2[
'result'] and check_zh(new_info) and check_interference(
new_info): # 判断是否含有中文,若含有中文,且品牌集合里面没有录入,则不当型号
data_obj["brand"] = new_info data_obj["brand"] = new_info
# elif resp1['result'] != '品牌' and resp2['result']: # elif resp1['result'] != '品牌' and resp2['result']:
# #
...@@ -188,9 +192,7 @@ class ImHandler(tornado.web.RequestHandler): ...@@ -188,9 +192,7 @@ class ImHandler(tornado.web.RequestHandler):
gn_res = self.predic.predict(new_info, 'gn') gn_res = self.predic.predict(new_info, 'gn')
k_res = check_k_num(" " + new_info) k_res = check_k_num(" " + new_info)
pcs_res = check_pcs_num(" " + new_info) pcs_res = check_pcs_num(" " + new_info)
if gn_res[ if gn_res["result"] and not k_res and not new_info.strip().isdigit() and not pcs_res and check_zh(new_info): # 剔除纯数字带K、pcs结尾的特殊情况,并且型号不能带有中文
"result"] and not k_res and not new_info.strip().isdigit() and not pcs_res and check_zh(
new_info): # 剔除纯数字带K、pcs结尾的特殊情况,并且型号不能带有中文
data_obj["gn"].append(new_info) data_obj["gn"].append(new_info)
if data_obj["brand"]: if data_obj["brand"]:
...@@ -224,9 +226,10 @@ class ImHandler(tornado.web.RequestHandler): ...@@ -224,9 +226,10 @@ class ImHandler(tornado.web.RequestHandler):
data_obj["price"] = price_res data_obj["price"] = price_res
if k_res or pcs_res or num_res or price_res: if k_res or pcs_res or num_res or price_res:
new_cut_info = new_cut_info.replace(list_info, "") new_cut_info = new_cut_info.replace(list_info, "")
time_res = check_time(" " + kw_word + " ")
if time_res:
#第一种写法 data_obj["delivery_time"] = time_res
# 第一种写法
# kw_word = " " + list_info # kw_word = " " + list_info
# k_res = check_k_num(kw_word) # k_res = check_k_num(kw_word)
# #
...@@ -255,7 +258,7 @@ class ImHandler(tornado.web.RequestHandler): ...@@ -255,7 +258,7 @@ class ImHandler(tornado.web.RequestHandler):
end_cut_list = new_cut_info.split(" ") end_cut_list = new_cut_info.split(" ")
len_lsit = [] len_lsit = []
for end_cut_info in end_cut_list: # 将剩下的未知单词剔除包含中文的单词 for end_cut_info in end_cut_list: # 将剩下的未知单词剔除包含中文的单词
if end_cut_info.strip() and check_zh(end_cut_info.strip()) and len(end_cut_info.strip()) > 6: if end_cut_info.strip() and check_zh(end_cut_info.strip()) and len(end_cut_info.strip()) > 4:
len_lsit.append(end_cut_info) len_lsit.append(end_cut_info)
if len_lsit and len(len_lsit) == 1 and not data_obj["gn"]: # 若剔除完中文后,此时型号还未空,则剩下的词很大概率是型号,暂时按时型号处理 if len_lsit and len(len_lsit) == 1 and not data_obj["gn"]: # 若剔除完中文后,此时型号还未空,则剩下的词很大概率是型号,暂时按时型号处理
data_obj["gn"].append(len_lsit[0].strip()) data_obj["gn"].append(len_lsit[0].strip())
...@@ -265,7 +268,6 @@ class ImHandler(tornado.web.RequestHandler): ...@@ -265,7 +268,6 @@ class ImHandler(tornado.web.RequestHandler):
res = {} res = {}
res["data"] = data res["data"] = data
# print(res)
self.write(res) self.write(res)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment