Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
lichenggang
/
bom_identify
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Snippets
Settings
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
4bd8fa67
authored
Jun 01, 2020
by
lzzzzl
Browse files
Options
_('Browse Files')
Download
Plain Diff
数量新增识别小数列
parents
17dc32e3
b2d26416
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
161 additions
and
80 deletions
client/client.py
extractor
model
predic_fac.py
predict/base_handler.py
predict/dict_predict.py
client/client.py
View file @
4bd8fa67
...
...
@@ -4,8 +4,7 @@ from protobuf import classify_pb2_grpc
import
pandas
as
pd
import
json
def
get_test_data
():
df
=
pd
.
read_excel
(
r'C:\Users\ICHUNT\Desktop\bomlist\00 BSJ BMS合并-询价-珠海能源.xlsx'
,
header
=
None
)
print
(
df
)
df
=
pd
.
read_excel
(
r'C:\Users\ICHUNT\Desktop\bomlist\DZ0901_V1.4_BOM.xlsx'
,
header
=
None
)
df
.
fillna
(
' '
,
inplace
=
True
)
dic_dft
=
df
.
to_dict
(
orient
=
'list'
)
return
json
.
dumps
(
dic_dft
)
...
...
extractor
View file @
4bd8fa67
No preview for this file type
model
View file @
4bd8fa67
No preview for this file type
predic_fac.py
View file @
4bd8fa67
...
...
@@ -29,7 +29,7 @@ if __name__ == "__main__":
def
get_test_data
():
import
pandas
as
pd
import
json
df
=
pd
.
read_excel
(
r'C:\
data\lx\51AB0571_ CCTV ASST询价_SZIMS.xlsx'
,
header
=
None
,
sheet_name
=
'1
'
)
df
=
pd
.
read_excel
(
r'C:\
Users\ICHUNT\Desktop\bomlist\51AB0571_ CCTV ASST询价_SZIMS.xlsx'
,
header
=
None
,
sheet_name
=
'3
'
)
df
.
fillna
(
'?'
,
inplace
=
True
)
dic_dft
=
df
.
to_dict
(
orient
=
'list'
)
return
json
.
dumps
(
dic_dft
)
...
...
@@ -40,7 +40,6 @@ if __name__ == "__main__":
data
=
get_test_data
()
p
=
PredictorFac
(
model_config
)
data
=
json
.
loads
(
data
)
res
=
p
.
predict
(
data
,
predict_type
=
'model'
)
print
(
data
)
res
=
p
.
predict
(
data
,
predict_type
=
'all'
)
print
(
res
)
predict/base_handler.py
View file @
4bd8fa67
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import
re
from
collections
import
Counter
from
utils.log_manager
import
get_logger
from
utils.robots
import
dd_send_msg
import
pandas
as
pd
NUMBER_LEVEL
=
0.7
CATEGORY
=
[
'二极管'
]
RIGHT_LEVEL
=
0.7
SEQ_LEVEL
=
0.5
CATE_LEVEL
=
0.5
MULTI_SAME_LEVEL
=
3
class
BasePredictor
:
'''
预测类基类
'''
# 占位符
PLACEHOLDER
=
'?'
def
__init__
(
self
,
name
,
extractor
,
classifier
):
self
.
name
=
name
self
.
classifier
=
classifier
...
...
@@ -22,16 +31,16 @@ class BasePredictor:
self
.
robot_msg
=
dd_send_msg
self
.
pd
=
pd
def
predict
(
self
,
key
):
raise
NotImplementedError
def
get_single_predict
(
self
,
string
:
object
)
->
tuple
:
def
get_single_predict
(
self
,
obj
:
object
)
->
tuple
:
'''
:param string: 接收单个要判断的字符串
:return tuple: 返回两个元素的元组, 第一个元素为判断结果, 第二个元素为可能性的字典
'''
string
=
str
(
obj
)
series
=
self
.
pd
.
Series
([
string
])
feature
=
self
.
extractor
.
transform
(
series
)
predictions
=
self
.
classifier
.
predict
(
feature
)
...
...
@@ -41,46 +50,80 @@ class BasePredictor:
dic_proba
=
{
k
:
v
for
k
,
v
in
zip
(
classes
,
deal_list
)}
return
predictions
[
0
],
dic_proba
def
isseq
(
self
,
data
):
"""
序号列预测
"""
collect_seq
=
[
int
(
kw
)
for
kw
in
data
if
isinstance
(
kw
,
float
)
or
isinstance
(
kw
,
int
)]
if
len
(
collect_seq
)
<
1
or
not
self
.
isIncrease
(
collect_seq
,
len
(
collect_seq
)):
return
False
else
:
rate
=
round
(
len
(
collect_seq
)
/
len
(
data
),
3
)
return
True
if
rate
>=
SEQ_LEVEL
else
False
def
valid_seq
(
self
,
data
):
"""取反"""
return
not
self
.
is_seq
(
data
)
def
valid_num
(
self
,
data
):
return
not
self
.
is_num
(
data
)
def
isnum
(
self
,
data
):
def
valid_cate
(
self
,
data
):
return
not
self
.
is_catecol
(
data
)
@classmethod
def is_num(cls, data):
    """Predict whether a column is a quantity column.

    A cell counts as a quantity when it is an ``int`` or ``float``, or when
    its string form starts with digits (optionally with a decimal part)
    followed by nothing, by ``K`` (case-insensitive), or by one to three
    CJK characters.

    :param data: sequence holding the cells of one column.
    :return: ``True`` when the share of quantity-like cells reaches
        ``RIGHT_LEVEL``; ``False`` otherwise, including for an empty column.
    """
    total = len(data)
    if total == 0:
        # An empty column cannot be a quantity column, and this guard
        # avoids dividing by zero below.
        return False

    def is_number_like(kw):
        """Return True when a single cell looks like a quantity."""
        if isinstance(kw, (int, float)):
            return True
        return re.match(
            r'(\d+|\d+(\.\d+))($|(K)|([\u4E00-\u9FA5]{1,3}))$',
            str(kw),
            re.M | re.I,
        ) is not None

    hits = sum(1 for kw in data if is_number_like(kw))
    rate = round(hits / total, 3)
    return rate >= RIGHT_LEVEL
def
isIncrease
(
self
,
arr
,
size
):
"""
判断列表元素是否递增
"""
if
size
==
1
:
return
True
return
(
arr
[
size
-
1
]
>=
arr
[
size
-
2
])
and
self
.
isIncrease
(
arr
,
size
-
1
)
@classmethod
def is_catecol(cls, data):
    """Predict whether a column is a category column.

    A cell is a hit when its string form contains any entry of ``CATEGORY``.

    :param data: sequence holding the cells of one column.
    :return: ``True`` when the share of hits reaches ``CATE_LEVEL``;
        ``False`` otherwise, including for an empty column.
    """
    total = len(data)
    if total == 0:
        # Nothing to classify; also avoids dividing by zero.
        return False
    # any() counts each cell at most once, even if it matches several
    # category keywords, so the rate can never exceed 1.
    hits = sum(
        1 for cell in data
        if any(cate in str(cell) for cate in CATEGORY)
    )
    rate = round(hits / total, 3)
    return rate >= CATE_LEVEL
@classmethod
def is_multi_same(cls, data):
    """Tell whether some value is repeated many times within a column.

    Cells equal to ``PLACEHOLDER`` are ignored before counting.

    :param data: sequence holding the (hashable) cells of one column.
    :return: ``True`` when the most frequent remaining value occurs at
        least ``MULTI_SAME_LEVEL`` times; ``False`` otherwise, including
        when no non-placeholder cell is left.
    """
    counts = Counter(cell for cell in data if cell != cls.PLACEHOLDER)
    if not counts:
        # Only placeholders (or no cells at all): nothing is repeated.
        return False
    return max(counts.values()) >= MULTI_SAME_LEVEL
def
isNumberCol
(
self
,
kw
):
@classmethod
def is_seq(cls, data):
    """Predict whether a column is a serial-number (row index) column.

    Only ``int`` and ``float`` cells are considered; each is truncated with
    ``int()``. The column qualifies when those values never decrease from
    one to the next and they make up at least ``SEQ_LEVEL`` of all cells.

    :param data: sequence holding the cells of one column.
    :return: ``True`` when the column looks like a sequence, else ``False``
        (also when there is no numeric cell at all).
    """
    # NOTE: int() raises on non-finite floats (NaN / inf).
    collect_seq = [int(kw) for kw in data if isinstance(kw, (float, int))]
    if not collect_seq:
        return False

    # Pairwise, iterative monotonicity check: a recursive helper would hit
    # Python's recursion limit on columns with many numeric rows.
    for previous, current in zip(collect_seq, collect_seq[1:]):
        if current < previous:
            return False

    rate = round(len(collect_seq) / len(data), 3)
    return rate >= SEQ_LEVEL
def
valid_seq
(
self
,
data
):
"""取反"""
return
not
self
.
isseq
(
data
)
def
valid_num
(
self
,
data
):
return
not
self
.
isnum
(
data
)
\ No newline at end of file
if
__name__
==
"__main__"
:
li
=
[
'?'
,
3400.0
,
5920.0
,
4849.0
,
2544.0
,
3270.0
,
52751.0
,
2031.0
,
5302.0
,
726.0
,
1247.0
,
2472.0
,
689.0
,
6049.0
,
26796.0
,
6164.0
,
1605.0
,
4346.0
,
640.0
,
960.0
,
960.0
,
320.0
,
160.0
,
860.0
,
160.0
,
320.0
,
3183.0
,
10151.0
,
640.0
,
130.0
,
1237.0
,
800.0
,
960.0
,
3740.0
,
17701.0
,
2146.0
,
1280.0
,
160.0
,
1120.0
,
160.0
,
480.0
,
960.0
,
480.0
,
160.0
,
4717.0
,
160.0
,
160.0
,
160.0
,
640.0
,
160.0
,
320.0
,
160.0
,
160.0
,
800.0
,
800.0
,
480.0
,
1600.0
,
155.0
,
960.0
,
320.0
,
944.0
,
160.0
,
160.0
,
1280.0
,
1852.0
,
7680.0
,
7680.0
,
2880.0
,
160.0
,
224.0
,
480.0
,
480.0
,
640.0
,
160.0
,
640.0
,
320.0
,
1760.0
,
640.0
,
480.0
,
960.0
,
160.0
,
160.0
,
160.0
,
160.0
,
1920.0
,
160.0
,
5600.0
,
480.0
,
2560.0
,
160.0
,
160.0
,
160.0
,
160.0
,
160.0
,
1280.0
,
160.0
,
160.0
,
160.0
,
160.0
,
160.0
,
320.0
,
0.0
,
160.0
,
160.0
]
print
(
BasePredictor
.
is_num
(
li
))
predict/dict_predict.py
View file @
4bd8fa67
...
...
@@ -5,38 +5,42 @@ from collections import Counter
from
predict.base_handler
import
BasePredictor
# 可能的头部字段
prob_fields
=
[
"序号"
,
"名称"
,
"规格"
,
"MPN"
,
"用量(pcs)"
,
"用量"
,
"pcs"
,
"位号"
,
"描述"
,
"值"
,
"数量"
,
"封装"
,
"类别"
,
"a面位置"
,
"b面位置"
,
"备注"
,
"需求数量"
,
"售价"
,
PROB_FIELDS
=
[
"序号"
,
"名称"
,
"规格"
,
"MPN"
,
"用量(pcs)"
,
"用量"
,
"pcs"
,
"位号"
,
"描述"
,
"值"
,
"数量"
,
"封装"
,
"类别"
,
"a面位置"
,
"b面位置"
,
"备注"
,
"需求数量"
,
"售价"
,
"封装"
,
"封装规格"
,
"参考品牌"
,
"品牌"
,
"item"
,
"厂商编码"
,
"品牌/厂商"
,
"参考料号"
,
"参考供应商"
,
"top面"
,
"bottom面"
]
# 标准名和代名词的映射
fields_map
=
{
"序号"
:
[
"序号"
],
STD_FIELDS_MAP
=
{
"类别"
:
[
"类别"
,
"分类"
,
"名称"
,
"类别名称"
],
"参数"
:
[
"参数"
,
"规格"
,
"描述"
],
"参数"
:
[
"参数"
,
"规格"
,
"描述"
,
"值"
],
"型号"
:
[
"型号"
,
"参考料号"
,
"料号"
,
"MPN"
,
"厂商编码"
],
"数量"
:
[
"数量"
,
"用量(pcs)"
,
"PCS"
,
"用量"
,
"用量(PCS)"
,
"pcs"
],
"封装"
:
[
"封装"
,
"封装规格"
],
"品牌"
:
[
"品牌"
,
"品牌/厂商"
,
"参考品牌"
,
"参考供应商"
,
"厂商"
,
"参考供应商"
,
"参考厂商"
]}
# 必须返回也必须验证的标准字段
MUST_STD_FIELDS
=
[
'参数'
,
'数量'
]
#
order_list
=
[
'序号'
]
en_to_zh_map
=
{
'brand_name'
:
'品牌'
,
'param'
:
'参数'
,
'goods_name'
:
'型号'
}
def fun(seri):
    """Find the first cell of a series that looks like a known header field.

    The comparison is case-insensitive on both sides, so upper-case entries
    of ``PROB_FIELDS`` such as ``"MPN"`` can match as well.

    :param seri: object exposing ``tolist()`` and ``name``
        (presumably a pandas Series; confirm against the caller).
    :return: ``(cell, seri.name)`` for the first matching cell, or ``None``
        when no cell matches.
    """
    # Normalise the reference names once; lowercasing only the cell would
    # make mixed/upper-case entries in PROB_FIELDS unreachable.
    known_fields = {str(name).lower() for name in PROB_FIELDS}
    for field in seri.tolist():
        if str(field).lower() in known_fields:
            return field, seri.name
    return None
def repeat_max(li):
    """Return the most frequent element of ``li``.

    When several elements share the highest count, the one that appears
    first in ``li`` wins.

    :param li: iterable of hashable items,
        e.g. ``['brand_name', 'param', 'brand_name']``.
    :return: the element with the highest number of occurrences.
    :raises IndexError: if ``li`` is empty.
    """
    # most_common(1) finds the maximum in a single pass instead of sorting
    # every (item, count) pair; equal counts keep first-seen order.
    return Counter(li).most_common(1)[0][0]
# 取前多少行
HEAD_ROW
=
7
# 空置率阈值
NAN_RATE
=
0.8
# 占位符
PLACEHOLDER
=
'?'
class
DicPredict
(
BasePredictor
):
...
...
@@ -56,14 +60,14 @@ class DicPredict(BasePredictor):
'pronoun'
:
field
,
'column_name'
:
column_name
}
for
k
,
v
in
fields_map
.
items
():
if
field
.
lower
()
in
fields_map
[
k
]:
for
k
,
v
in
STD_FIELDS_MAP
.
items
():
if
field
.
lower
()
in
STD_FIELDS_MAP
[
k
]:
dic
[
'std_name'
]
=
k
li_res
.
append
(
dic
)
return
li_res
def
pre
_predict
(
self
,
dict_data
):
def
head
_predict
(
self
,
dict_data
):
columns
=
[]
li_data
=
[]
...
...
@@ -75,15 +79,13 @@ class DicPredict(BasePredictor):
dft
=
df
.
T
.
head
(
HEAD_ROW
)
dft
.
columns
=
columns
li_res_raw
=
self
.
id_by_field
(
dft
)
std_result
=
[]
ab_result
=
[]
std_result
=
{}
ab_result
=
{}
for
i
in
li_res_raw
:
if
i
.
get
(
'std_name'
):
dic_has_res
=
{
i
[
'column_name'
]:
i
[
'std_name'
]}
std_result
.
append
(
dic_has_res
)
std_result
[
i
[
'column_name'
]]
=
i
[
'std_name'
]
else
:
dic_ab_res
=
{
i
[
'column_name'
]:
i
[
'pronoun'
]}
ab_result
.
append
(
dic_ab_res
)
ab_result
[
i
[
'column_name'
]]
=
i
[
'pronoun'
]
pre_id_res
=
{
'std_result'
:
std_result
,
'ab_result'
:
ab_result
,
...
...
@@ -92,54 +94,73 @@ class DicPredict(BasePredictor):
def
model_predict
(
self
,
dic_data
):
"""
该方法目前只对[
参数, 型号, 数量, 品牌]进行
预测
该方法目前只对[
序号(非标准), 数量, 类别, 参数, 型号, 品牌]进行预测, 前三者是非模型
预测
:param dic_data:
:return
:
:return
:只有[参数, 数量]会强制有结果
"""
print
(
dic_data
)
prob_columns
=
[]
temp_pre_model_res
=
{}
ab_result
=
{}
for
k
,
v
in
dic_data
.
items
():
bol
=
self
.
v_chain
(
v
)
if
bol
:
print
(
k
,
bol
)
prob_columns
.
append
(
k
)
continue
if
self
.
isnum
(
v
):
if
self
.
is_seq
(
v
):
ab_result
[
k
]
=
'序号'
continue
if
self
.
is_num
(
v
):
temp_pre_model_res
[
k
]
=
'数量'
continue
if
self
.
isseq
(
v
):
temp_pre_model_res
[
k
]
=
'序号'
temp_dic_data
=
{
k
:
list
(
filter
(
lambda
x
:
x
!=
PLACEHOLDER
,
dic_data
[
k
]))
for
k
in
prob_columns
}
if
self
.
is_catecol
(
v
):
temp_pre_model_res
[
k
]
=
'类别'
continue
temp_dic_data
=
{
k
:
list
(
filter
(
lambda
x
:
x
!=
self
.
PLACEHOLDER
,
dic_data
[
k
]))
for
k
in
prob_columns
}
for
k
,
v
in
temp_dic_data
.
items
():
li_single_pred_res
=
[]
for
string
in
v
:
single_pred_res
,
probdic
=
self
.
get_single_predict
(
string
)
li_single_pred_res
.
append
(
single_pred_res
)
result
=
Counter
(
li_single_pred_res
)
# [('brand_name', 4), ('goods_name', 3), ('param', 2)]
li_sort
=
sorted
(
result
.
items
(),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)
print
(
k
,
li_sort
)
temp_pre_model_res
[
k
]
=
en_to_zh_map
[
li_sort
[
0
][
0
]]
print
(
temp_pre_model_res
)
pre_model_res
=
{}
result
=
repeat_max
(
li_single_pred_res
)
temp_pre_model_res
[
k
]
=
en_to_zh_map
[
result
]
# 参数和型号列出现多条相同值则丢弃
prob_param_and_gn_cols
=
[
i
for
i
in
temp_pre_model_res
if
temp_pre_model_res
[
i
]
==
'参数'
or
temp_pre_model_res
[
i
]
==
'型号'
]
for
col
in
prob_param_and_gn_cols
:
if
self
.
is_multi_same
(
temp_dic_data
[
col
]):
temp_pre_model_res
.
pop
(
col
)
model_id_res
=
{
'std_result'
:
temp_pre_model_res
,
'ab_result'
:
ab_result
,
}
return
model_id_res
def
predict
(
self
,
dic_data
,
predict_type
=
'all'
):
dic_data
=
self
.
pre_deal
(
dic_data
)
if
predict_type
==
'all'
:
pre_id_res
=
self
.
pre_predict
(
dic_data
)
if
pre_id_res
:
return
pre_id_res
pre_id_res
=
self
.
head_predict
(
dic_data
)
model_id_res
=
self
.
model_predict
(
dic_data
)
# 表头预测和模型预测最后返回的数据进行综合处理
pre_std_result
=
pre_id_res
.
get
(
'std_result'
)
model_std_result
=
model_id_res
.
get
(
'std_result'
)
comprehensive_res
=
self
.
get_comprehensive_res
(
pre_std_result
,
model_std_result
)
if
comprehensive_res
:
res
=
{
'std_result'
:
comprehensive_res
,
'ab_result'
:
pre_id_res
[
'ab_result'
],
}
return
res
elif
predict_type
==
'model'
:
model_id_res
=
self
.
model_predict
(
dic_data
)
if
model_id_res
:
return
model_id_res
elif
predict_type
==
'pre'
:
pre_id_res
=
self
.
pre_predict
(
dic_data
)
elif
predict_type
==
'head'
:
pre_id_res
=
self
.
head_predict
(
dic_data
)
if
pre_id_res
:
return
pre_id_res
...
...
@@ -149,7 +170,7 @@ class DicPredict(BasePredictor):
# 去掉空置率大于等于0.8的列
counter
=
0
for
item
in
v
:
if
str
(
item
)
.
strip
()
==
PLACEHOLDER
:
if
str
(
item
)
.
strip
()
==
self
.
PLACEHOLDER
:
counter
+=
1
if
counter
/
len
(
v
)
<=
NAN_RATE
:
new_dic_data
[
k
]
=
v
...
...
@@ -170,3 +191,23 @@ class DicPredict(BasePredictor):
else
:
return
True
def get_comprehensive_res(self, pre_std_result, model_std_result):
    """Merge header-based and model-based predictions by voting.

    Both inputs map ``column -> standard field name``. For every standard
    field name, the column that was proposed most often (see
    ``repeat_max``) is kept.

    :param pre_std_result: mapping from the header-based prediction;
        ``None`` is treated as empty.
    :param model_std_result: mapping from the model-based prediction;
        ``None`` is treated as empty.
    :return: dict mapping the winning column to its standard field name.
        If one column wins for several names, the later name overwrites
        the earlier one.
    """
    vote_count = {
        "类别": [],
        "参数": [],
        "型号": [],
        "数量": [],
        "品牌": []
    }
    for std_result in (pre_std_result, model_std_result):
        for column, std_name in (std_result or {}).items():
            # setdefault keeps the fixed order above but no longer fails
            # with KeyError on standard names outside those five.
            vote_count.setdefault(std_name, []).append(column)

    comprehensive_res = {}
    for std_name, columns in vote_count.items():
        if columns:
            comprehensive_res[repeat_max(columns)] = std_name
    return comprehensive_res
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment