Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
lichenggang
/
bom_identify
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Snippets
Settings
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
1bed00a1
authored
Jul 23, 2021
by
陈森彬
Browse files
Options
_('Browse Files')
Download
Email Patches
Plain Diff
新增询报价聊天识别
parent
5b14b1b3
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
293 additions
and
39 deletions
README.md
config.py
utils/functions.py
utils/status.py
valid_server.py
README.md
View file @
1bed00a1
...
...
@@ -39,7 +39,7 @@
-
启动单个字符串验证的服务: python3 valid_server.py
### 所在目录
测试环境:
IP: 192.168.
2.23
2
IP: 192.168.
1.25
2
PATH: /home/lic/bom_identify
正式环境:
...
...
config.py
View file @
1bed00a1
...
...
@@ -15,6 +15,23 @@ else:
model_config
[
'modextr_path'
]
=
project_path
+
r'/'
+
'models_and_extractors/'
redis_config
=
{
'host'
:
'127.0.0.1'
,
'port'
:
6379
,
'decode_responses'
:
True
,
'password'
:
'icDb29mLy2s'
}
MG_HOST_SET
=
{
'test'
:
'192.168.1.237'
,
'produce'
:
'172.18.137.23'
}
def get_mongo_conf():
    """Build the MongoDB connection settings for the active environment.

    The host is looked up in ``MG_HOST_SET`` using ``ENVIRONMENT`` and the
    password is chosen according to whether ``ENVIRONMENT`` is ``'produce'``.

    :return: dict with the keys ``host``, ``port``, ``database``, ``user``
        and ``password``.
    :raises KeyError: if ``ENVIRONMENT`` is not a key of ``MG_HOST_SET``.
    """
    # NOTE(review): the credentials are hard-coded in source; consider
    # loading them from the environment or a secrets store instead.
    mongo_host = MG_HOST_SET[ENVIRONMENT]
    if ENVIRONMENT == 'produce':
        mongo_password = 'huntmon66499'
    else:
        mongo_password = 'huntmon6699'
    return {
        "host": mongo_host,
        "port": 27017,
        "database": 'ichunt',
        "user": 'ichunt',
        "password": mongo_password,
    }
unit_map
=
{
'Ω'
:
{
'μΩ'
:
0.000001
,
...
...
@@ -124,7 +141,7 @@ encap_list = ["0030", "008004", "01005", "015008", "0201", "02016", "0202", "020
temp_map
=
[
"C0G"
,
"NP0"
,
"COG"
,
"NPO"
,
"X7R"
,
"X5R"
,
"Y5V"
,
"X6S"
,
"X7S"
,
"X7T"
,
"SL"
,
"U2J"
,
"UJ"
,
"X7U"
,
"X8R"
,
"Z5U"
,
"C0H"
,
"COH"
,
"U2K"
,
"X6T"
,
"X8G"
,
"X8L"
,
"Y5R"
,
"Y5U"
,
"ZLM"
]
unit_regex
=
"μΩ|uΩ|mΩ|Ω|kΩ|KΩ|MΩ|GΩ|TΩ|pF|PF|Pf|pf|nF|NF|Nf|nf|µF|μF|uF|UF|Uf|uf|mF|MF|Mf|mf|F|pH|Ph|PH|ph|nH|µH|UH|μh|uh|Uh|uH|mH|Mh|MH|mh|H|mA|A|a|V|v|kV|Kv|kv|KV|W|w|kW|kw|KW|Kw|
%
|毫欧|欧姆|欧|千欧|兆欧|伏特|伏|千伏|瓦特|瓦"
unit_regex
=
"μΩ|uΩ|mΩ|Ω|kΩ|KΩ|MΩ|GΩ|TΩ|
f|
pF|PF|Pf|pf|nF|NF|Nf|nf|µF|μF|uF|UF|Uf|uf|mF|MF|Mf|mf|F|pH|Ph|PH|ph|nH|µH|UH|μh|uh|Uh|uH|mH|Mh|MH|mh|H|mA|A|a|V|v|kV|Kv|kv|KV|W|w|kW|kw|KW|Kw|
%
|毫欧|欧姆|欧|千欧|兆欧|伏特|伏|千伏|瓦特|瓦"
r_regex
=
"Rr"
...
...
@@ -136,8 +153,10 @@ split_char = '|,,/ '
special_tuple
=
(
"-"
,)
special_str
=
"~!@#$
%
^&*()_+
-
*/<>,.。,[]
\
/"
special_str
=
"~!@#$
%
^&*()_+*/<>,.。,[]
\
/"
attr_regex
=
"F|H|K|A|W|KW"
special_attr_regex
=
"Ω"
replace_char
=
"|/。,或各和::"
utils/functions.py
View file @
1bed00a1
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import
re
from
config
import
unit_map
,
cast_map
,
unit_regex
,
special_tuple
,
attr_regex
,
temp_map
,
special_str
,
encap_regex
,
\
r_regex
,
legal_char
,
split_char
,
special_attr_regex
from
predict.kw_predict
import
KwPredict
from
config
import
*
from
utils.mongo_opera
import
MongoOperator
unit_pattern
=
re
.
compile
(
'([.a-zA-Z
\\
-]?)(
\
d+|
\
d+[
\\
\
./?]
\
d+)('
+
unit_regex
+
')([.a-zA-Z
\\
-]?)'
)
attr_pattern
=
re
.
compile
(
'(
\
d+|
\
.
\
d+)('
+
attr_regex
+
')('
+
special_attr_regex
+
')'
)
attr_pattern
=
re
.
compile
(
'(
\
d+|
\
.
\
d+)('
+
attr_regex
+
')('
+
special_attr_regex
+
')'
)
r_pattern
=
re
.
compile
(
'([.a-zA-Z
\\
-]?)(
\
d+['
+
r_regex
+
']
\
d+|['
+
r_regex
+
']
\
d+|
\
d+['
+
r_regex
+
'])([.a-zA-Z
\\
-]?)'
)
encap_pattern
=
re
.
compile
(
'(.{0,2})('
+
encap_regex
+
')(.{0,2})'
,
re
.
I
)
# Quantity / price extraction patterns used by the check_* helpers below.
# They are written as raw strings so that regex escapes such as \s, \d and \.
# are not run through Python's own string-escape processing (in a normal
# string literal they are invalid escape sequences and trigger a warning on
# modern interpreters). The \uXXXX escapes are resolved by ``re`` itself.

# Bare quantity: a number preceded by whitespace or a CJK character and
# followed by whitespace, "个" or "片".
num_pattern = re.compile(r'(\s|[\u4e00-\u9fa5])(\d+\.\d+|\d+)(\s|个|片)')

# Quantity with a K suffix (case-insensitive), e.g. " 3k" or " 2.5K".
k_num_pattern = re.compile(r'(\s|[\u4e00-\u9fa5])(\d+\.\d+|\d+)(K)', re.I)

# Quantity with a "pcs" suffix (case-insensitive), e.g. " 500PCS".
pcs_num_pattern = re.compile(r'(\s|[\u4e00-\u9fa5])(\d+\.\d+|\d+)(pcs)', re.I)

# Any integer or decimal number; single group, so findall() yields strings.
price_num_pattern = re.compile(r'(\d+\.\d+|\d+)')

# One or more consecutive characters in the range U+4E00..U+9FA5.
zh_pattern = re.compile(r'[\u4e00-\u9fa5]+')

# A number immediately followed by "usd" (case-insensitive).
usd_num_pattern = re.compile(r'(\d+\.\d+|\d+)(usd)', re.I)
prefixchar
=
[
i
+
j
for
i
in
split_char
for
j
in
legal_char
]
tailchar
=
[
i
+
j
for
i
in
legal_char
for
j
in
split_char
]
prefixchar
.
extend
(
list
(
legal_char
))
...
...
@@ -16,6 +23,33 @@ tailchar.extend(list(split_char))
prefixchar
.
append
(
''
)
tailchar
.
append
(
''
)
mongo_op
=
MongoOperator
()
def predict_gn_param(predict: KwPredict, kw) -> int:
    '''
    Decide whether a keyword is a part number or a parameter.

    Keywords of at most one character are rejected. Keywords containing a
    hyphen or a digit are accepted without consulting the predictor.
    Otherwise both the 'gn' and the 'param' predictions are requested and the
    keyword is accepted when either one reports a result equal to 1.

    :param predict: predictor instance; its predict(kw, kind) must return a
        mapping with a 'result' entry
    :param kw: keyword to classify
    :return: 1|0
    '''
    if len(kw) <= 1:
        return 0
    # A hyphen anywhere, or any run of digits, is taken as sufficient evidence.
    if re.search(r'(?:.*\-.*|\d+)', kw):
        return 1
    # Both predictions are always requested, matching the original call order.
    gn_answer = predict.predict(kw, 'gn')
    param_answer = predict.predict(kw, 'param')
    return int(gn_answer['result'] == 1 or param_answer['result'] == 1)
predic
=
KwPredict
(
'validSingle'
)
def
is_float
(
s
):
xiaoshu_new
=
str
(
s
)
...
...
@@ -134,7 +168,7 @@ def unit_conversion(unit_res, kw):
if
"/"
not
in
kw
:
# 单独处理除号
if
"
%
"
in
kw
:
# 百分号的单独处理
if
"-"
in
unit_str
:
unit_str
=
unit_str
.
replace
(
"-"
,
""
)
unit_str
=
unit_str
.
replace
(
"-"
,
""
)
unit_num
=
int
(
kw
.
replace
(
unit_str
,
""
)
.
replace
(
"-"
,
""
))
else
:
if
"."
in
kw
:
...
...
@@ -321,3 +355,93 @@ def check_attr(kw_info):
if
res
[
0
][
0
]
and
res
[
0
][
1
]
and
"."
not
in
res
[
0
][
0
]
and
not
res
[
0
][
2
]:
return
True
return
False
def check_k_num(kw_info):
    '''
    Find a quantity written with a K (thousands) suffix.

    Only the first match of k_num_pattern is used.

    :param kw_info: text to scan
    :return: False when nothing matches; otherwise a tuple
        (expanded, matched) where expanded is the number multiplied by 1000,
        as a string, and matched is the number together with its K suffix
        exactly as it appeared in the text
    '''
    hits = k_num_pattern.findall(kw_info)
    if not hits:
        return False
    first_hit = hits[0]
    digits = first_hit[1]
    matched_text = digits + first_hit[2]
    if "." in digits:
        # Decimal quantities go through delete_extra_zero after scaling.
        expanded = delete_extra_zero(float(digits) * 1000)
    else:
        expanded = int(digits) * 1000
    return str(expanded), matched_text
def check_pcs_num(kw_info):
    '''
    Find a quantity written with a "pcs" suffix.

    Only the first match of pcs_num_pattern is used.

    :param kw_info: text to scan
    :return: the numeric part of the first match as a string, or False when
        nothing matches
    '''
    hits = pcs_num_pattern.findall(kw_info)
    return hits[0][1].strip() if hits else False
def check_num(kw_info):
    '''
    Find a plain number that stands on its own.

    Only the first match of num_pattern is used.

    :param kw_info: text to scan
    :return: the numeric part of the first match as a string, or False when
        nothing matches
    '''
    hits = num_pattern.findall(kw_info)
    if hits:
        return hits[0][1].strip()
    return False
def check_price(kw_info):
    '''
    Extract a price from the text.

    When the text contains "价" or "税", the first number found by
    price_num_pattern is taken as the price. Otherwise only a number
    matched by usd_num_pattern (a number followed by "usd") is accepted.

    :param kw_info: text to scan
    :return: the price as a string, or False when none is found
    '''
    mentions_price = "价" in kw_info or "税" in kw_info
    if mentions_price:
        amounts = price_num_pattern.findall(kw_info)
        return amounts[0].strip() if amounts else False
    usd_hits = usd_num_pattern.findall(kw_info)
    if not usd_hits:
        return False
    # Each usd match is a (number, unit) tuple; only the number is returned.
    return usd_hits[0][0].strip()
def check_zh(kw_info):
    '''
    Report whether the text is free of Chinese characters.

    Mind the polarity: True means zh_pattern found nothing, False means at
    least one match was found.

    :param kw_info: text to scan
    :return: True when no match is found, otherwise False
    '''
    return not zh_pattern.findall(kw_info)
def replace_symbol(kw):
    '''
    Replace every separator symbol with a space so that the text uses one
    uniform delimiter.

    :param kw: text to normalise
    :return: the text with each character of replace_char turned into " "
    '''
    # One translation pass covers every character listed in replace_char.
    to_space = str.maketrans(dict.fromkeys(replace_char, " "))
    return kw.translate(to_space)
utils/status.py
View file @
1bed00a1
...
...
@@ -27,6 +27,7 @@ class StatusCode:
"100003"
:
(
"no qty"
,
"未检测到数量"
),
"100004"
:
(
"no param or gn"
,
"未检测到参数/型号列"
),
"100005"
:
(
"not yet included"
,
"中文映射还未收录"
),
"100006"
:
(
"not origin"
,
"没有请求来源"
),
}
...
...
valid_server.py
View file @
1bed00a1
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import
re
import
json
from
urllib.parse
import
unquote
import
tornado.web
import
tornado.ioloop
from
utils.functions
import
pre_judge
,
word_conversion
,
cut_params
,
check_param
,
unit_conversion
,
get_not_exist_list
,
\
check_encap
,
check_temp
,
check_symbol
,
check_param_r
,
check_param_k
,
check_attr
from
utils.functions
import
*
from
utils.redis_cli
import
redis_cli
from
predict.kw_predict
import
KwPredict
from
utils.status
import
code2msg
import
re
def predict_gn_param(predict: KwPredict, kw) -> int:
    '''
    Decide whether a keyword is a part number or a parameter.

    Keywords of at most one character are rejected. Keywords containing a
    hyphen or a digit are accepted without consulting the predictor.
    Otherwise both the 'gn' and the 'param' predictions are requested and the
    keyword is accepted when either one reports a result equal to 1.

    :param predict: predictor instance; its predict(kw, kind) must return a
        mapping with a 'result' entry
    :param kw: keyword to classify
    :return: 1|0
    '''
    if len(kw) <= 1:
        return 0
    # A hyphen anywhere, or any run of digits, is taken as sufficient evidence.
    if re.search(r'(?:.*\-.*|\d+)', kw):
        return 1
    # Both predictions are always requested, matching the original call order.
    gn_answer = predict.predict(kw, 'gn')
    param_answer = predict.predict(kw, 'param')
    return int(gn_answer['result'] == 1 or param_answer['result'] == 1)
class
KwHandler
(
tornado
.
web
.
RequestHandler
):
pat
=
re
.
compile
(
r'(?:.*\-.*|\d+)'
)
...
...
@@ -152,11 +125,147 @@ class UCHandler(tornado.web.RequestHandler):
words2_list
.
append
(
kw_info
)
res
[
'status'
]
=
1
res
[
'words'
]
=
list
(
set
(
words_list
))
#
bom使用
res
[
'words'
]
=
list
(
set
(
words_list
))
#
bom使用
res
[
'attrs'
]
=
list
(
set
(
attrs_list
))
res
[
'encap'
]
=
list
(
set
(
encap_list
))
res
[
'words2'
]
=
list
(
set
(
words2_list
))
#words2 前台搜索使用
print
(
res
)
res
[
'words2'
]
=
list
(
set
(
words2_list
))
# words2 前台搜索使用
# print(res)
self
.
write
(
res
)
class ImHandler(tornado.web.RequestHandler):
    """Handler that pulls brand, part number, package, quantity and price
    out of free-text lines.

    The POST body is parsed as JSON and is expected to be a list; each
    element is treated as one line of text and produces one result dict with
    the keys ``gn``, ``brand``, ``number``, ``price`` and ``encap``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # NOTE(review): Tornado normally builds a handler object per request,
        # so this would create a new KwPredict each time — confirm that is
        # cheap, or share one instance.
        self.predic = KwPredict('validSingle')
        # Stored on the instance, but post() reads the module-level redis_cli.
        self.redis_cli = redis_cli
        # Unit matching. The pattern is empty and the attribute is not read
        # anywhere else in this class.
        self.num_pattern = re.compile('')

    async def post(self):
        """Parse the JSON body and write the extraction result.

        An empty body value yields the message for code '100002', a non-list
        value the message for code '100001' (both via code2msg); otherwise
        the response is ``{"data": [...]}`` with one dict per input element.

        NOTE(review): json.loads is not guarded, so a malformed body raises
        here — confirm that is the intended behaviour.
        """
        data_list = json.loads(self.request.body)
        if not data_list:
            code = '100002'
            res = code2msg(code)
        elif type(data_list) != list:
            code = '100001'
            res = code2msg(code)
        else:
            # Known brand names, loaded once per request.
            brand_set = redis_cli.smembers('brand_set')
            data = []
            for data_info in data_list:
                # Result skeleton for this line.
                data_obj = {}
                data_obj["gn"] = []
                data_obj["brand"] = ""
                data_obj["number"] = ""
                data_obj["price"] = ""
                data_obj["encap"] = ""
                new_data_info = []
                # Unify the separators and replace Chinese brackets.
                # NOTE(review): as shown here both arguments of each
                # replace() are the same character, which makes the calls
                # no-ops; the first argument was presumably a full-width
                # bracket — confirm against the repository.
                data_info = replace_symbol(data_info).replace("(", "(").replace(")", ")")
                data_cut_list = data_info.split(" ")
                # Space-joined copy of the non-empty tokens; recognised
                # pieces are removed from it step by step below.
                new_cut_info = ""
                for cut_info in data_cut_list:  # first filtering pass: drop empty tokens
                    if cut_info.strip():
                        new_data_info.append(cut_info)
                        new_cut_info += cut_info + " "
                for new_info in new_data_info:
                    if new_info in brand_set:
                        data_obj["brand"] = new_info
                    else:
                        resp1 = self.predic.predict(new_info, 'all')
                        resp2 = self.predic.predict(new_info, 'brand')
                        # True when the token contains no Chinese characters.
                        zh_res = check_zh(new_info)
                        if resp1['result'] == '品牌' and resp2['result'] and "HK" not in new_info and zh_res:
                            # Check for Chinese: a token that contains Chinese
                            # and is not recorded in the brand set is not
                            # accepted here.
                            data_obj["brand"] = new_info
                        # elif resp1['result'] != '品牌' and resp2['result']:
                        #
                        # elif resp1['result'] == '品牌' and not resp2['result']:
                        else:
                            # Check whether the token is a package; only
                            # standard packages are matched.
                            encap_res = check_encap(new_info)
                            if encap_res:
                                data_obj["encap"] = new_info
                            else:
                                gn_res = self.predic.predict(new_info, 'gn')
                                # A leading space is added because the
                                # quantity patterns need a character before
                                # the number.
                                k_res = check_k_num(" " + new_info)
                                pcs_res = check_pcs_num(" " + new_info)
                                if gn_res["result"] and not k_res and not new_info.strip().isdigit() and not pcs_res and check_zh(new_info):
                                    # Exclude pure numbers and tokens ending
                                    # in K / pcs; a part number must not
                                    # contain Chinese either.
                                    data_obj["gn"].append(new_info)
                # Remove everything recognised so far from the working text.
                if data_obj["brand"]:
                    new_cut_info = new_cut_info.replace(data_obj["brand"], "", 1)
                if data_obj["encap"]:
                    new_cut_info = new_cut_info.replace(data_obj["encap"], "")
                if data_obj["gn"]:
                    for gn in data_obj["gn"]:
                        new_cut_info = new_cut_info.replace(gn, "")
                new_cut_list = new_cut_info.split(" ")
                for list_info in new_cut_list:  # quantity and price extraction
                    if list_info.strip():
                        kw_word = " " + list_info
                        k_res = check_k_num(kw_word)
                        if k_res:
                            # k_res is (expanded number, matched text).
                            data_obj["number"] = k_res[0]
                            kw_word = kw_word.replace(k_res[1], "")
                        pcs_res = check_pcs_num(kw_word)
                        if pcs_res and not data_obj["number"]:
                            data_obj["number"] = pcs_res
                            kw_word = kw_word.replace(pcs_res, "")
                        num_res = check_num(kw_word + " ")
                        if num_res and not data_obj["number"]:
                            data_obj["number"] = num_res
                            kw_word = kw_word.replace(num_res, "")
                        price_res = check_price(kw_word + " ")
                        if price_res:
                            data_obj["price"] = price_res
                        if k_res or pcs_res or num_res or price_res:
                            # The token was consumed; drop it from the
                            # working text.
                            new_cut_info = new_cut_info.replace(list_info, "")
                        # First (earlier) approach, kept for reference:
                        # kw_word = " " + list_info
                        # k_res = check_k_num(kw_word)
                        #
                        # if k_res:
                        #     data_obj["number"] = k_res
                        #     new_cut_info = new_cut_info.replace(list_info, "")
                        # else:
                        #
                        #     pcs_res = check_pcs_num(kw_word)
                        #     if pcs_res:
                        #         data_obj["number"] = pcs_res
                        #         new_cut_info = new_cut_info.replace(list_info, "")
                        #     else:
                        #         kw_word = kw_word + " "
                        #         num_res = check_num(kw_word)
                        #         print(num_res,kw_word)
                        #         if num_res and list_info != new_data_info[0]:
                        #             data_obj["number"] = num_res
                        #             new_cut_info = new_cut_info.replace(list_info, "")
                        #         else:
                        #             price_res = check_price(kw_word)
                        #             if price_res and not data_obj["price"]:
                        #                 data_obj["price"] = price_res
                        #                 new_cut_info = new_cut_info.replace(list_info, "")
                if new_cut_info.strip():
                    end_cut_list = new_cut_info.split(" ")
                    len_lsit = []
                    for end_cut_info in end_cut_list:
                        # From the remaining unknown words keep only those
                        # without Chinese that are longer than 6 characters.
                        if end_cut_info.strip() and check_zh(end_cut_info.strip()) and len(end_cut_info.strip()) > 6:
                            len_lsit.append(end_cut_info)
                    if len_lsit and len(len_lsit) == 1 and not data_obj["gn"]:
                        # If no part number has been found yet and exactly
                        # one candidate is left, it is very likely the part
                        # number; treat it as such for now.
                        data_obj["gn"].append(len_lsit[0].strip())
                # data_obj always has its five keys here, so this is always true.
                if data_obj:
                    data.append(data_obj)
                # print(data_obj)
            res = {}
            res["data"] = data
        # print(res)
        self.write(res)
...
...
@@ -166,6 +275,7 @@ def gen_app():
register_tornado_handlers
=
{
'/keyword'
:
KwHandler
,
'/unit_conversion'
:
UCHandler
,
'/identify_meaning'
:
ImHandler
,
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment