Commit 61152668 by 岳巧源

add new rex

parent a85343c1
Showing with 52 additions and 31 deletions
...@@ -45,7 +45,7 @@ query = { ...@@ -45,7 +45,7 @@ query = {
} }
query2 = { query2 = {
"_source":["goods_id","goods_name","brand_name"], "_source": ["goods_id", "goods_name", "brand_name", "goods_name_origin"],
"query": { "query": {
"bool": { "bool": {
"must": [ "must": [
...@@ -89,10 +89,12 @@ black_list = {"eastsheep", "ABB", "BRsanyuan", "JCSTRONG", "TECHNOLOGY", ...@@ -89,10 +89,12 @@ black_list = {"eastsheep", "ABB", "BRsanyuan", "JCSTRONG", "TECHNOLOGY",
"HIKVISION", "HIKVISION",
"UGREEN", "xiaovv", "maxhub", "SAMSUNG", "SAMZHE", "ALINX", "Xilinx", "Lenovo", "MeeTap", "BCNET", "UGREEN", "xiaovv", "maxhub", "SAMSUNG", "SAMZHE", "ALINX", "Xilinx", "Lenovo", "MeeTap", "BCNET",
"LANGQING", "LANGQING",
"ORICO", "HUB", "Panasonic", "SDKELI", "10m", "9m", "8m", "7m", "6m", "5m", "4m", "3m", "2m", "1m" "ORICO", "HUB", "Panasonic", "SDKELI", "10m", "9m", "8m", "7m", "6m", "5m", "4m", "3m", "2m", "1m",
"TI", "HX", "Isabellenhuette",
} }
count = 0
# Single-threaded parsing of the ~4000-row input would take roughly an hour,
# so run four copies of this script as separate processes, each handling a
# different slice of the file's rows.
def main(): def main():
df = pd.read_excel('jd.xlsx') df = pd.read_excel('jd.xlsx')
...@@ -101,7 +103,7 @@ def main(): ...@@ -101,7 +103,7 @@ def main():
result = [] result = []
goods_name_list = [] goods_name_list = []
for i in range(len(data)): for i in range(len(data)):
# for debug, don't forget to delete this line. # for debug, don't forget to delete this line.
sku_name_str = data[i][0] sku_name_str = data[i][0]
res_tmp = match_sku(sku_name_str) res_tmp = match_sku(sku_name_str)
ans.append(res_tmp) ans.append(res_tmp)
...@@ -111,7 +113,7 @@ def main(): ...@@ -111,7 +113,7 @@ def main():
for j in range(len(ans[i])): for j in range(len(ans[i])):
tmp_goods_name = ans[i][j] tmp_goods_name = ans[i][j]
tmp_query = query2 tmp_query = query2
new_str = tmp_goods_name.replace('/', '').replace('-', '') new_str = tmp_goods_name.replace('/', '').replace('-', '').replace('.', '').replace(',', '')
tmp_query["query"]["bool"]["must"][0]["match"]["goods_name"] = new_str tmp_query["query"]["bool"]["must"][0]["match"]["goods_name"] = new_str
tmp_map = process(tmp_query, tmp_goods_name) tmp_map = process(tmp_query, tmp_goods_name)
if len(tmp_map) != 0: if len(tmp_map) != 0:
...@@ -129,7 +131,8 @@ def main(): ...@@ -129,7 +131,8 @@ def main():
index = k index = k
result.append(score_in_one_row[index]) result.append(score_in_one_row[index])
print(i, "------", score_in_one_row[index]) print(i, "------", score_in_one_row[index])
return result #返回一个列表,列表中要么为空字典,要么为 {"goods_name": _goods_name, "goods_id": _goods_id, "brand_name": _brand_name, "score": _score} 字典 return result # 返回一个列表,列表中要么为空字典,要么为 {"goods_name": _goods_name, "goods_id": _goods_id, "brand_name": _brand_name, "score": _score} 字典
def match_sku(s: str) -> []: def match_sku(s: str) -> []:
# s = "RG58/U射频连接线SMA公头转BNC公头延长线 SMA/BNC-JJ Q9/SMA-JJ RG58/U-SMA/BNC-JJ 2m" # s = "RG58/U射频连接线SMA公头转BNC公头延长线 SMA/BNC-JJ Q9/SMA-JJ RG58/U-SMA/BNC-JJ 2m"
...@@ -140,9 +143,20 @@ def match_sku(s: str) -> []: ...@@ -140,9 +143,20 @@ def match_sku(s: str) -> []:
if len(tmp) == 0 or tmp in black_list: if len(tmp) == 0 or tmp in black_list:
continue continue
# 正则表达式检测被空格分开的每个字符串,每个字符串 最多检测三个 英文字母,"/", "-", 数字0-9组成的子字符串 # 正则表达式检测被空格分开的每个字符串,每个字符串 最多检测三个 英文字母,"/", "-", 数字0-9组成的子字符串
# 此正则表达式暂时启用
# pattern = re.compile(
# r'[^0-9a-zA-Z]*([0-9a-zA-Z][0-9a-zA-Z/-]{2,})[^0-9a-zA-Z/-]*([0-9a-zA-Z][0-9a-zA-Z/-]{2,})*[^0-9a-zA-Z]*([0-9a-zA-Z][0-9a-zA-Z/-]{2,})*')
pattern = re.compile( pattern = re.compile(
r'[^0-9a-zA-Z]*([0-9a-zA-Z][0-9a-zA-Z/-]{2,})[^0-9a-zA-Z/-]*([0-9a-zA-Z][0-9a-zA-Z/-]{2,})*[^0-9a-zA-Z]*([0-9a-zA-Z][0-9a-zA-Z/-]{2,})*') r'[^0-9a-zA-Z]*([0-9a-zA-Z][0-9a-zA-Z\\.,/-]{2,})[^0-9a-zA-Z/-]*([0-9a-zA-Z][0-9a-zA-Z\\.,/-]{2,})*[^0-9a-zA-Z]*([0-9a-zA-Z][0-9a-zA-Z\\.,/-]{2,})*[^0-9a-zA-Z]*([0-9a-zA-Z][0-9a-zA-Z\\.,/-]{2,})*'
)
pat2 = re.compile(
r'[0-9a-zA-Z]{1,2}[^0-9a-zA-Z/-]{1,}([0-9a-zA-Z][0-9a-zA-Z/-]{2,})[^0-9a-zA-Z/-]*([0-9a-zA-Z][0-9a-zA-Z/-]{2,})*')
pat3 = re.compile(r'.*)([0-9a-zA-Z][0-9a-zA-Z/-]{2,}).*')
m = pattern.match(tmp) m = pattern.match(tmp)
if m is None:
m = pat2.match(tmp)
if m is None:
m = pat3.match(tmp)
if m is not None: if m is not None:
for k in range(len(m.groups())): for k in range(len(m.groups())):
tmp_str = m.groups()[k] tmp_str = m.groups()[k]
...@@ -153,7 +167,7 @@ def match_sku(s: str) -> []: ...@@ -153,7 +167,7 @@ def match_sku(s: str) -> []:
# 检查是否为纯数字,纯数字不会加入结果列表 # 检查是否为纯数字,纯数字不会加入结果列表
rex = re.compile("^[0-9]*$") rex = re.compile("^[0-9]*$")
mat = rex.search(tmp_str) mat = rex.search(tmp_str)
if mat is not None: if mat is not None and len(tmp_str) <= 4:
continue continue
res.append(tmp_str) res.append(tmp_str)
return res return res
...@@ -164,64 +178,72 @@ def post_to_elasticsearch(q: dict): ...@@ -164,64 +178,72 @@ def post_to_elasticsearch(q: dict):
ans = response.json() ans = response.json()
return ans return ans
def process(query_str: dict, key_word: str):
    """Run *query_str* against Elasticsearch and return the best-scoring hit.

    Returns a dict with keys goods_name, goods_id, brand_name, score,
    key_word and goods_name_origin for the first hit whose _score equals
    max_score, but only when max_score > 80; otherwise returns {}.

    query_str: Elasticsearch query body (mutated upstream before each call).
    key_word:  the matched keyword, echoed back into the result for tracing.
    """
    res = post_to_elasticsearch(query_str)
    hits = res["hits"]
    max_score = hits["max_score"]
    # max_score is None when the query produced no hits at all; the > 80
    # threshold filters out weak fuzzy matches.
    if max_score is not None and max_score > 80:
        for hit in hits["hits"]:
            if hit["_score"] == max_score:
                src = hit["_source"]
                return {
                    "goods_name": src["goods_name"],
                    "goods_id": src["goods_id"],
                    "brand_name": src["brand_name"],
                    "score": hit["_score"],
                    "key_word": key_word,
                    "goods_name_origin": src["goods_name_origin"],
                }
    return {}
# 覆盖写 # 覆盖写
def write_to_xlsx(info_map_list: list): # def write_to_xlsx(info_map_list: list):
table = {'goods_name': [], 'goods_id': [], 'brand_name': []} # table = {'goods_name': [], 'goods_id': [], 'brand_name': [], 'goods_name_origin': []}
for i in range(len(info_map_list)): # for i in range(len(info_map_list)):
if len(info_map_list[i]) != 0: # if len(info_map_list[i]) != 0:
table["goods_name"].append(info_map_list[i]["goods_name"]) # table["goods_name"].append(info_map_list[i]["goods_name"])
table["goods_id"].append(str(info_map_list[i]["goods_id"])) # table["goods_id"].append(str(info_map_list[i]["goods_id"]))
table["brand_name"].append(info_map_list[i]["brand_name"]) # table["brand_name"].append(info_map_list[i]["brand_name"])
else: # table["goods_name_origin"].append(info_map_list[i][""])
table["goods_name"].append('') # else:
table["goods_id"].append('') # table["goods_name"].append('')
table["brand_name"].append('') # table["goods_id"].append('')
df = pd.DataFrame(table) # table["brand_name"].append('')
df.to_excel('ans.xlsx', sheet_name='Sheet1', index=False, startcol=3) # df = pd.DataFrame(table)
# df.to_excel('ans.xlsx', sheet_name='Sheet1', index=False, startcol=3)
# Append-style write: adds result columns next to the existing data in jd.xlsx.
def write_to_xlsx_append(info_map_list: list):
    """Write match results into columns F..J of the first sheet of jd.xlsx.

    info_map_list: one entry per input row — either an empty dict (written
    as blank cells) or a dict with keys goods_name, goods_id, brand_name,
    key_word and goods_name_origin, as produced by process().
    """
    # Column order maps directly onto spreadsheet columns 6..10 (F..J).
    columns = ['goods_name', 'goods_id', 'brand_name', 'key_word',
               'goods_name_origin']
    first_col = 6

    rows = []
    for info in info_map_list:
        if info:
            # goods_id is stored as text so Excel does not mangle large ids.
            rows.append([info['goods_name'], str(info['goods_id']),
                         info['brand_name'], info['key_word'],
                         info['goods_name_origin']])
        else:
            rows.append([''] * len(columns))

    data = openpyxl.load_workbook('jd.xlsx')
    table = data[data.sheetnames[0]]

    # Header row.
    for offset, header in enumerate(columns):
        table.cell(1, first_col + offset).value = header
    # Data rows start at row 2, in the same order as info_map_list.
    for row_idx, row in enumerate(rows, start=2):
        for offset, value in enumerate(row):
            table.cell(row_idx, first_col + offset).value = value

    data.save('jd.xlsx')
if __name__ == '__main__':
    # Match every sku row against Elasticsearch, then append the results
    # as extra columns to jd.xlsx.
    res = main()
    write_to_xlsx_append(res)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment