Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
lichenggang
/
bom_identify
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Snippets
Settings
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
de3ff5d8
authored
Aug 27, 2021
by
陈森彬
Browse files
Options
_('Browse Files')
Download
Email Patches
Plain Diff
新增询报价聊天识别
parent
32ae2048
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
49 additions
and
13 deletions
config.py
utils/functions.py
valid_server.py
config.py
View file @
de3ff5d8
...
...
@@ -160,3 +160,5 @@ attr_regex = "F|H|K|A|W|KW"
# The ohm sign, kept separate from the plain-letter units in attr_regex.
# NOTE(review): where this is consumed is not visible here — confirm usage.
special_attr_regex = "Ω"
# Separator characters. replace_symbol() walks this string character by
# character and swaps each one for a single space.
replace_char = "|/。,或各和::"
# Substrings that disqualify a token: check_interference() returns False as
# soon as any entry occurs inside the text it is given.
# (The name is spelled "tupe" in the code base; kept as-is for callers.)
interference_tupe = ("HK", "week",)
utils/functions.py
View file @
de3ff5d8
...
...
@@ -16,6 +16,8 @@ pcs_num_pattern = re.compile('(\s|[\u4e00-\u9fa5])(\d+\.\d+|\d+)(pcs)', re.I)
# Pre-compiled patterns shared by the check_* helpers.
# Raw strings are used so that regex escapes such as \d, \. and \s are passed
# to `re` untouched instead of being parsed as (invalid) string escapes; the
# \uXXXX escapes are resolved by `re` itself, so matching is unchanged.

# A decimal or integer number.
price_num_pattern = re.compile(r'(\d+\.\d+|\d+)')
# One or more characters in the range U+4E00..U+9FA5.
zh_pattern = re.compile(r'[\u4e00-\u9fa5]+')
# A number immediately followed by "usd" (case-insensitive).
usd_num_pattern = re.compile(r'(\d+\.\d+|\d+)(usd)', re.I)
# Delivery time:
#   group 1 - a whitespace character or a U+4E00..U+9FA5 character before it
#   group 2 - the quantity: a numeric range, a number, or a Chinese numeral
#   group 3 - the unit
# No re.I flag is set, so the unit alternatives match exactly as written.
time_pattern = re.compile(
    r'(\s|[\u4e00-\u9fa5])'
    r'(\d+[-~到至]\d+|\d+|一|二|三|四|五|六|七|八|九|十|两|叁|贰)'
    r'(日|周|月|weeks|week|天|DAY|工作日)'
)

# Every split character followed by every legal character ...
prefixchar = [i + j for i in split_char for j in legal_char]
# ... and every legal character followed by every split character.
tailchar = [i + j for i in legal_char for j in split_char]
# Each legal character on its own also counts as a prefix.
prefixchar.extend(legal_char)
...
...
@@ -373,7 +375,7 @@ def check_k_num(kw_info):
res
=
str
(
delete_extra_zero
(
float
(
res
[
0
][
1
])
*
1000
))
else
:
res
=
str
(
int
(
res
[
0
][
1
])
*
1000
)
return
res
,
res_num
return
res
,
res_num
# return res
...
...
@@ -396,11 +398,14 @@ def check_num(kw_info):
:param kw_info:
:return:
'''
res
=
num_pattern
.
findall
(
kw_info
)
res
=
num_pattern
.
findall
(
kw_info
.
replace
(
","
,
""
))
if
not
res
:
return
False
else
:
if
res
[
0
][
1
][
0
]
!=
"0"
:
return
res
[
0
][
1
]
.
strip
()
return
False
def
check_price
(
kw_info
):
...
...
@@ -445,3 +450,30 @@ def replace_symbol(kw):
for
replace_info
in
replace_char
:
kw
=
kw
.
replace
(
replace_info
,
" "
)
return
kw
def check_time(kw_info):
    '''
    Extract a delivery time from the text.

    Only the first match of ``time_pattern`` is used.

    :param kw_info: text to scan
    :return: the matched quantity followed by its unit, with "到"/"至" in the
             quantity rewritten to "-" and the unit "工作日" rewritten to
             "天"; False when nothing matches
    '''
    first_hit = time_pattern.search(kw_info)
    if first_hit is None:
        return False

    # Group 2 is the quantity, group 3 the unit (group 1 is the lead-in char).
    amount = first_hit.group(2)
    for range_word in ("到", "至"):
        amount = amount.replace(range_word, "-")
    unit = first_hit.group(3).replace("工作日", "天")
    return amount + unit
def check_interference(kw_info):
    '''
    Check that the text contains none of the interference words.

    :param kw_info: text to inspect
    :return: False if any entry of ``interference_tupe`` occurs as a
             substring of ``kw_info``, True otherwise
    '''
    # The previous loop printed every candidate word to stdout on each call;
    # that debug output is dropped, the result is the same.
    return not any(word in kw_info for word in interference_tupe)
valid_server.py
View file @
de3ff5d8
...
...
@@ -159,9 +159,11 @@ class ImHandler(tornado.web.RequestHandler):
data_obj
[
"number"
]
=
""
data_obj
[
"price"
]
=
""
data_obj
[
"encap"
]
=
""
data_obj
[
"delivery_time"
]
=
""
new_data_info
=
[]
data_info
=
replace_symbol
(
data_info
)
.
replace
(
"("
,
"("
)
.
replace
(
")"
,
")"
)
# 统一分隔符,替换中文括号
# print(data_info.decode("gbk").encode("utf-8"))
data_info
=
replace_symbol
(
data_info
)
.
replace
(
"("
,
"("
)
.
replace
(
")"
,
")"
)
.
replace
(
"
\xa0
"
,
""
)
.
replace
(
"
\t
"
,
" "
)
# 统一分隔符,替换中文括号
data_cut_list
=
data_info
.
split
(
" "
)
new_cut_info
=
""
for
cut_info
in
data_cut_list
:
# 第一遍过滤
...
...
@@ -174,8 +176,10 @@ class ImHandler(tornado.web.RequestHandler):
else
:
resp1
=
self
.
predic
.
predict
(
new_info
,
'all'
)
resp2
=
self
.
predic
.
predict
(
new_info
,
'brand'
)
zh_res
=
check_zh
(
new_info
)
if
resp1
[
'result'
]
==
'品牌'
and
resp2
[
'result'
]
and
"HK"
not
in
new_info
and
zh_res
:
#判断是否含有中文,若含有中文,且品牌集合里面没有录入,则不当型号
# zh_res = check_zh(new_info)
if
resp1
[
'result'
]
==
'品牌'
and
resp2
[
'result'
]
and
check_zh
(
new_info
)
and
check_interference
(
new_info
):
# 判断是否含有中文,若含有中文,且品牌集合里面没有录入,则不当型号
data_obj
[
"brand"
]
=
new_info
# elif resp1['result'] != '品牌' and resp2['result']:
#
...
...
@@ -188,9 +192,7 @@ class ImHandler(tornado.web.RequestHandler):
gn_res
=
self
.
predic
.
predict
(
new_info
,
'gn'
)
k_res
=
check_k_num
(
" "
+
new_info
)
pcs_res
=
check_pcs_num
(
" "
+
new_info
)
if
gn_res
[
"result"
]
and
not
k_res
and
not
new_info
.
strip
()
.
isdigit
()
and
not
pcs_res
and
check_zh
(
new_info
):
# 剔除纯数字带K、pcs结尾的特殊情况,并且型号不能带有中文
if
gn_res
[
"result"
]
and
not
k_res
and
not
new_info
.
strip
()
.
isdigit
()
and
not
pcs_res
and
check_zh
(
new_info
):
# 剔除纯数字带K、pcs结尾的特殊情况,并且型号不能带有中文
data_obj
[
"gn"
]
.
append
(
new_info
)
if
data_obj
[
"brand"
]:
...
...
@@ -224,9 +226,10 @@ class ImHandler(tornado.web.RequestHandler):
data_obj
[
"price"
]
=
price_res
if
k_res
or
pcs_res
or
num_res
or
price_res
:
new_cut_info
=
new_cut_info
.
replace
(
list_info
,
""
)
#第一种写法
time_res
=
check_time
(
" "
+
kw_word
+
" "
)
if
time_res
:
data_obj
[
"delivery_time"
]
=
time_res
# 第一种写法
# kw_word = " " + list_info
# k_res = check_k_num(kw_word)
#
...
...
@@ -255,7 +258,7 @@ class ImHandler(tornado.web.RequestHandler):
end_cut_list
=
new_cut_info
.
split
(
" "
)
len_lsit
=
[]
for
end_cut_info
in
end_cut_list
:
# 将剩下的未知单词剔除包含中文的单词
if
end_cut_info
.
strip
()
and
check_zh
(
end_cut_info
.
strip
())
and
len
(
end_cut_info
.
strip
())
>
6
:
if
end_cut_info
.
strip
()
and
check_zh
(
end_cut_info
.
strip
())
and
len
(
end_cut_info
.
strip
())
>
4
:
len_lsit
.
append
(
end_cut_info
)
if
len_lsit
and
len
(
len_lsit
)
==
1
and
not
data_obj
[
"gn"
]:
# 若剔除完中文后,此时型号还未空,则剩下的词很大概率是型号,暂时按时型号处理
data_obj
[
"gn"
]
.
append
(
len_lsit
[
0
]
.
strip
())
...
...
@@ -265,7 +268,6 @@ class ImHandler(tornado.web.RequestHandler):
res
=
{}
res
[
"data"
]
=
data
# print(res)
self
.
write
(
res
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment