Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
岳巧源
/
my-awesome-project
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Snippets
Settings
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
61152668
authored
Jun 21, 2024
by
岳巧源
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add new rex
parent
a85343c1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
52 additions
and
31 deletions
main.py
main.py
View file @
61152668
...
...
@@ -45,7 +45,7 @@ query = {
}
query2
=
{
"_source"
:
[
"goods_id"
,
"goods_name"
,
"brand_name
"
],
"_source"
:
[
"goods_id"
,
"goods_name"
,
"brand_name"
,
"goods_name_origin
"
],
"query"
:
{
"bool"
:
{
"must"
:
[
...
...
@@ -89,10 +89,12 @@ black_list = {"eastsheep", "ABB", "BRsanyuan", "JCSTRONG", "TECHNOLOGY",
"HIKVISION"
,
"UGREEN"
,
"xiaovv"
,
"maxhub"
,
"SAMSUNG"
,
"SAMZHE"
,
"ALINX"
,
"Xilinx"
,
"Lenovo"
,
"MeeTap"
,
"BCNET"
,
"LANGQING"
,
"ORICO"
,
"HUB"
,
"Panasonic"
,
"SDKELI"
,
"10m"
,
"9m"
,
"8m"
,
"7m"
,
"6m"
,
"5m"
,
"4m"
,
"3m"
,
"2m"
,
"1m"
"ORICO"
,
"HUB"
,
"Panasonic"
,
"SDKELI"
,
"10m"
,
"9m"
,
"8m"
,
"7m"
,
"6m"
,
"5m"
,
"4m"
,
"3m"
,
"2m"
,
"1m"
,
"TI"
,
"HX"
,
"Isabellenhuette"
,
}
count
=
0
# 如果是采用单线程的模式跑这个脚本,解析完4000多行数据恐怕要1个小时了 所以还是采用多进程的方式吧 用四个脚本同时解析一个文件的不同行
def
main
():
df
=
pd
.
read_excel
(
'jd.xlsx'
)
...
...
@@ -101,7 +103,7 @@ def main():
result
=
[]
goods_name_list
=
[]
for
i
in
range
(
len
(
data
)):
# for debug, don't forget to delete this line.
# for debug, don't forget to delete this line.
sku_name_str
=
data
[
i
][
0
]
res_tmp
=
match_sku
(
sku_name_str
)
ans
.
append
(
res_tmp
)
...
...
@@ -111,7 +113,7 @@ def main():
for
j
in
range
(
len
(
ans
[
i
])):
tmp_goods_name
=
ans
[
i
][
j
]
tmp_query
=
query2
new_str
=
tmp_goods_name
.
replace
(
'/'
,
''
)
.
replace
(
'-'
,
''
)
new_str
=
tmp_goods_name
.
replace
(
'/'
,
''
)
.
replace
(
'-'
,
''
)
.
replace
(
'.'
,
''
)
.
replace
(
','
,
''
)
tmp_query
[
"query"
][
"bool"
][
"must"
][
0
][
"match"
][
"goods_name"
]
=
new_str
tmp_map
=
process
(
tmp_query
,
tmp_goods_name
)
if
len
(
tmp_map
)
!=
0
:
...
...
@@ -129,7 +131,8 @@ def main():
index
=
k
result
.
append
(
score_in_one_row
[
index
])
print
(
i
,
"------"
,
score_in_one_row
[
index
])
return
result
#返回一个列表,列表中要么为空字典,要么为 {"goods_name": _goods_name, "goods_id": _goods_id, "brand_name": _brand_name, "score": _score} 字典
return
result
# 返回一个列表,列表中要么为空字典,要么为 {"goods_name": _goods_name, "goods_id": _goods_id, "brand_name": _brand_name, "score": _score} 字典
def
match_sku
(
s
:
str
)
->
[]:
# s = "RG58/U射频连接线SMA公头转BNC公头延长线 SMA/BNC-JJ Q9/SMA-JJ RG58/U-SMA/BNC-JJ 2m"
...
...
@@ -140,9 +143,20 @@ def match_sku(s: str) -> []:
if
len
(
tmp
)
==
0
or
tmp
in
black_list
:
continue
# 正则表达式检测被空格分开的每个字符串,每个字符串 最多检测三个 英文字母,"/", "-", 数字0-9组成的子字符串
# 此正则表达式暂时启用
# pattern = re.compile(
# r'[^0-9a-zA-Z]*([0-9a-zA-Z][0-9a-zA-Z/-]{2,})[^0-9a-zA-Z/-]*([0-9a-zA-Z][0-9a-zA-Z/-]{2,})*[^0-9a-zA-Z]*([0-9a-zA-Z][0-9a-zA-Z/-]{2,})*')
pattern
=
re
.
compile
(
r'[^0-9a-zA-Z]*([0-9a-zA-Z][0-9a-zA-Z/-]{2,})[^0-9a-zA-Z/-]*([0-9a-zA-Z][0-9a-zA-Z/-]{2,})*[^0-9a-zA-Z]*([0-9a-zA-Z][0-9a-zA-Z/-]{2,})*'
)
r'[^0-9a-zA-Z]*([0-9a-zA-Z][0-9a-zA-Z\\.,/-]{2,})[^0-9a-zA-Z/-]*([0-9a-zA-Z][0-9a-zA-Z\\.,/-]{2,})*[^0-9a-zA-Z]*([0-9a-zA-Z][0-9a-zA-Z\\.,/-]{2,})*[^0-9a-zA-Z]*([0-9a-zA-Z][0-9a-zA-Z\\.,/-]{2,})*'
)
pat2
=
re
.
compile
(
r'[0-9a-zA-Z]{1,2}[^0-9a-zA-Z/-]{1,}([0-9a-zA-Z][0-9a-zA-Z/-]{2,})[^0-9a-zA-Z/-]*([0-9a-zA-Z][0-9a-zA-Z/-]{2,})*'
)
pat3
=
re
.
compile
(
r'.*)([0-9a-zA-Z][0-9a-zA-Z/-]{2,}).*'
)
m
=
pattern
.
match
(
tmp
)
if
m
is
None
:
m
=
pat2
.
match
(
tmp
)
if
m
is
None
:
m
=
pat3
.
match
(
tmp
)
if
m
is
not
None
:
for
k
in
range
(
len
(
m
.
groups
())):
tmp_str
=
m
.
groups
()[
k
]
...
...
@@ -153,7 +167,7 @@ def match_sku(s: str) -> []:
# 检查是否为纯数字,纯数字不会加入结果列表
rex
=
re
.
compile
(
"^[0-9]*$"
)
mat
=
rex
.
search
(
tmp_str
)
if
mat
is
not
None
:
if
mat
is
not
None
and
len
(
tmp_str
)
<=
4
:
continue
res
.
append
(
tmp_str
)
return
res
...
...
@@ -164,64 +178,72 @@ def post_to_elasticsearch(q: dict):
ans
=
response
.
json
()
return
ans
def process(query_str: dict, key_word: str, min_score: float = 80) -> dict:
    """Run one Elasticsearch query and return its top-scoring hit as a flat dict.

    Args:
        query_str: Query body passed unchanged to ``post_to_elasticsearch``.
        key_word: The keyword this query was built from; echoed back in the
            result under ``"key_word"``.
        min_score: The response's ``max_score`` must be strictly greater than
            this value for a hit to be returned. Defaults to 80.

    Returns:
        ``{}`` when the response has no ``max_score`` or it does not exceed
        ``min_score``; otherwise a dict with the keys ``goods_name``,
        ``goods_id``, ``brand_name``, ``score``, ``key_word`` and
        ``goods_name_origin`` taken from the first hit whose ``_score``
        equals ``max_score``.
    """
    res = post_to_elasticsearch(query_str)
    hits = res["hits"]
    max_score = hits["max_score"]
    # max_score can be None (presumably when the search matched nothing --
    # confirm against the Elasticsearch response format), so test it first.
    if max_score is None or max_score <= min_score:
        return {}

    for hit in hits["hits"]:
        if hit["_score"] != max_score:
            continue
        source = hit["_source"]
        return {
            "goods_name": source["goods_name"],
            "goods_id": source["goods_id"],
            "brand_name": source["brand_name"],
            "score": hit["_score"],
            "key_word": key_word,
            # A hit may lack this field in its _source; fall back to an
            # empty string instead of raising KeyError and losing the match.
            "goods_name_origin": source.get("goods_name_origin", ""),
        }
    return {}
# Overwrite mode
def write_to_xlsx(info_map_list: list):
    """Write the match results to ``ans.xlsx``.

    Each entry of ``info_map_list`` becomes one row with the columns
    ``goods_name``, ``goods_id`` (converted with ``str``) and ``brand_name``.
    An empty entry produces a row of empty strings. The frame is written to
    sheet ``Sheet1`` without the index, starting at zero-based column 3.
    """
    columns = ('goods_name', 'goods_id', 'brand_name')
    table = {column: [] for column in columns}
    for info in info_map_list:
        if len(info) == 0:
            # No match for this row: keep the row, but leave it blank.
            for column in columns:
                table[column].append('')
            continue
        table["goods_name"].append(info["goods_name"])
        table["goods_id"].append(str(info["goods_id"]))
        table["brand_name"].append(info["brand_name"])
    df = pd.DataFrame(table)
    df.to_excel('ans.xlsx', sheet_name='Sheet1', index=False, startcol=3)
# def write_to_xlsx(info_map_list: list):
# table = {'goods_name': [], 'goods_id': [], 'brand_name': [], 'goods_name_origin': []}
# for i in range(len(info_map_list)):
# if len(info_map_list[i]) != 0:
# table["goods_name"].append(info_map_list[i]["goods_name"])
# table["goods_id"].append(str(info_map_list[i]["goods_id"]))
# table["brand_name"].append(info_map_list[i]["brand_name"])
# table["goods_name_origin"].append(info_map_list[i][""])
# else:
# table["goods_name"].append('')
# table["goods_id"].append('')
# table["brand_name"].append('')
# df = pd.DataFrame(table)
# df.to_excel('ans.xlsx', sheet_name='Sheet1', index=False, startcol=3)
# Append mode: add the result columns to the existing workbook
def write_to_xlsx_append(info_map_list: list, path: str = 'jd.xlsx'):
    """Write the match results into columns 6-10 of an existing workbook.

    The workbook at ``path`` is loaded, its first sheet is updated in place
    and the workbook is saved back to the same path. Row 1 receives the
    column headers; entry ``i`` of ``info_map_list`` is written to sheet
    row ``i + 2``.

    Args:
        info_map_list: One dict per input row. A non-empty dict must provide
            ``goods_name``, ``goods_id``, ``brand_name`` and ``key_word``;
            ``goods_name_origin`` is written when present. An empty dict
            produces empty cells.
        path: Workbook to update. Defaults to ``'jd.xlsx'``.
    """
    columns = ('goods_name', 'goods_id', 'brand_name', 'key_word',
               'goods_name_origin')
    first_col = 6  # 1-based column index of the first result column

    data = openpyxl.load_workbook(path)
    table = data[data.sheetnames[0]]

    for offset, name in enumerate(columns):
        table.cell(1, first_col + offset).value = name

    for row, info in enumerate(info_map_list, start=2):
        if not info:
            values = [''] * len(columns)
        else:
            values = [
                info["goods_name"],
                str(info["goods_id"]),
                info["brand_name"],
                info["key_word"],
                # Tolerate result dicts that carry no goods_name_origin.
                info.get("goods_name_origin", ''),
            ]
        for offset, value in enumerate(values):
            table.cell(row, first_col + offset).value = value

    data.save(path)
# Script entry point: compute the matches, then write them back into the workbook.
if __name__ == '__main__':
    res = main()
    write_to_xlsx_append(res)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment