Commit 0fc3ae3e by lichenggang

init

parent c920f231
#!encoding:utf-8
import requests
from lxml import etree
import re
from utils.mysqlopera import MySqlOperator
# Shared MySqlOperator handle, constructed with the name 'szlc'. The functions
# in this file use its `db` attribute to open cursors and to commit.
operator = MySqlOperator('szlc')
# Request headers sent with the catalog request made in get_szlc_all().
# NOTE(review): "authority", "method", "path" and "scheme" look like HTTP/2
# pseudo-headers copied from browser dev tools without their leading colon;
# here they are sent as ordinary header fields -- confirm they are needed.
headers = {
    "authority": "www.szlcsc.com",
    "method": "GET",
    "path": "/catalog.html",
    "scheme": "https",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    # NOTE(review): "br" is advertised here; presumably decoding a brotli
    # response needs an optional brotli package at runtime -- confirm.
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "max-age=0",
    # NOTE(review): fixed date; a server that honours conditional requests
    # could presumably answer 304 with an empty body -- confirm.
    "if-modified-since": "Wed, 13 Mar 2019 02:15:00 GMT",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"}
# Fetch every first- and second-level category from the catalog page.
def get_szlc_all():
    """Scrape the szlcsc catalog page into a nested list of category dicts.

    Returns:
        A list with one dict per ``div.item`` element. Each dict has the keys
        ``name``, ``num``, ``url``, ``level`` (1), ``islast`` (0),
        ``parent_id`` (0) and ``2_list``. ``2_list`` holds one dict per nested
        ``div.child-item`` element with the keys ``name``, ``num``, ``url``,
        ``level`` (2) and ``islast`` (1).

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an error status.
        IndexError: if an element does not carry the expected
            "name (count)" text or link.
    """
    level_name_pattern = re.compile(r'([^\(]*) \(')
    level_num_pattern = re.compile(r'[^\(]*\((\d+)\)')
    level2_name_pattern = re.compile(r'([^\(]*)\(')
    url = 'https://www.szlcsc.com/catalog.html'
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on an error page instead of silently parsing zero categories.
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    dom_html = etree.HTML(resp.text)

    categories = []
    for dom in dom_html.xpath('//div[@class="item"]'):
        # The third text node of the direct <a> children holds "name (count)".
        label = dom.xpath('./a/text()')[2]
        print(label)
        per_1_cate = {
            'name': level_name_pattern.findall(label)[0].strip(),
            'num': int(level_num_pattern.findall(label)[0]),
            'url': dom.xpath('./a/@href')[0],
            'level': 1,
            'islast': 0,
            'parent_id': 0,
            '2_list': [],
        }
        for dom_cate_2 in dom.xpath('.//div[@class="child-item"]'):
            child_label = dom_cate_2.xpath('.//a/text()')[0]
            per_2_cate = {
                'name': level2_name_pattern.findall(child_label)[0].strip(),
                'num': int(level_num_pattern.findall(child_label)[0]),
                'url': dom_cate_2.xpath('.//a/@href')[0],
                # These two belong to the child; writing them onto the parent
                # would relabel every parent with children as level 2.
                'level': 2,
                'islast': 1,
            }
            per_1_cate['2_list'].append(per_2_cate)
        categories.append(per_1_cate)
    return categories
# Set is_show to 0 for every category in the database.
def clean_db():
    """Hide all rows of ``lie_category`` by setting ``is_show`` to 0.

    The update is committed on the shared ``operator.db`` connection.
    """
    hide_all_sql = "update lie_category set is_show = 0"
    with operator.db.cursor() as cur:
        cur.execute(hide_all_sql)
    operator.db.commit()
# Fetch the ID and name of every category already stored.
def get_old_allname_and_cat_id():
    """Return every row of ``lie_category`` as selected ``cat_id, cat_name``.

    Returns:
        Whatever ``cursor.fetchall()`` yields for the query.
    """
    query = "select cat_id,cat_name from lie_category "
    with operator.db.cursor() as cur:
        cur.execute(query)
        return cur.fetchall()
# Look up existing IDs for both category levels, then update or insert the
# first-level categories.
def add_cat_id(all, old):
    """Attach known ``cat_id`` values and write first-level categories.

    Args:
        all: List of first-level category dicts, each with ``name``, ``url``
            and a ``2_list`` of second-level dicts.
        old: Rows to match names against; presumably ``(cat_id, cat_name)``
            rows from ``get_old_allname_and_cat_id`` -- verify against caller.

    Returns:
        The same ``all`` list, mutated in place. First-level entries that get
        inserted here still have no ``cat_id`` afterwards.
    """
    for top in all:
        # First row containing the name wins for a first-level category.
        top_row = next((row for row in old if top['name'] in row), None)
        if top_row is not None:
            top['cat_id'] = top_row[0]
        for child in top['2_list']:
            # No early exit here: the last row containing the name wins.
            for row in old:
                if child['name'] in row:
                    child['cat_id'] = row[0]

    update_sql = 'UPDATE lie_category SET parent_id=%s,is_show=%s, url=%s,islast=%s,level=%s WHERE cat_id=%s'
    insert_sql = 'INSERT into lie_category(cat_name,parent_id,sort_order,is_show,url,islast,level,page_count) values' \
                 '(%s,%s,%s,%s,%s,%s,%s,%s)'
    with operator.db.cursor() as cur:
        for top in all:
            if not top.get('cat_id'):
                # Unknown category: create it as a visible first-level row.
                cur.execute(insert_sql, (top["name"], 0, 50, 1, top["url"], 0, 1, 0))
                continue
            # Known category: re-show it and refresh its URL and hierarchy.
            cur.execute(update_sql, (0, 1, top['url'], 0, 1, top['cat_id']))
    operator.db.commit()
    return all
# Re-read category IDs and names so newly inserted first-level rows are included.
def get_new_allname_and_cat_id():
    """Return every row of ``lie_category`` as selected ``cat_id, cat_name``.

    Returns:
        Whatever ``cursor.fetchall()`` yields for the query.
    """
    with operator.db.cursor() as db_cursor:
        db_cursor.execute("select cat_id,cat_name from lie_category ")
        rows = db_cursor.fetchall()
    return rows
# Attach IDs to the first-level categories.
def add_more_cat_id(all_, new):
    """Set ``cat_id`` on each first-level category that has a matching row.

    Args:
        all_: List of category dicts, each with a ``name`` key.
        new: Sequence of rows; a row matches when it contains the category
            name, and its first element is used as the ID.

    Returns:
        The same ``all_`` list, mutated in place. Entries without a matching
        row are left unchanged.
    """
    for category in all_:
        # Only the first row containing the name is used.
        hit = next((row for row in new if category['name'] in row), None)
        if hit is not None:
            category['cat_id'] = hit[0]
    return all_
# Give each second-level category its parent_id, inserting a new row when it
# has no ID yet.
def over(all_):
    """Update or insert every second-level category under its parent.

    Args:
        all_: List of first-level category dicts. Each needs a ``cat_id`` and
            a ``2_list`` of second-level dicts with ``name`` and ``url``
            (and ``cat_id`` when the row already exists).

    Raises:
        KeyError: if a first-level entry with children has no ``cat_id``.
    """
    update_sql = 'UPDATE lie_category SET parent_id=%s,is_show=%s, url=%s,islast=%s,level=%s WHERE cat_id=%s'
    insert_sql = 'INSERT into lie_category(cat_name,parent_id,sort_order,is_show,url,islast,level,page_count) values' \
                 '(%s,%s,%s,%s,%s,%s,%s,%s)'
    with operator.db.cursor() as cur:
        for parent in all_:
            for child in parent['2_list']:
                if not child.get('cat_id'):
                    # Unknown child: create it as a visible level-2 row.
                    params = (child["name"], parent['cat_id'], 50, 1, child["url"], 1, 2, 0)
                    cur.execute(insert_sql, params)
                else:
                    # Known child: re-show it and point it at this parent.
                    params = (parent['cat_id'], 1, child['url'], 1, 2, child['cat_id'])
                    cur.execute(update_sql, params)
    operator.db.commit()
if __name__ == '__main__':
    # Scrape before touching the database: clean_db() commits is_show = 0 for
    # every category, so a failed scrape must not be allowed to run after it.
    scraped = get_szlc_all()
    clean_db()
    old = get_old_allname_and_cat_id()
    # Match or insert first-level categories and match second-level ones.
    all_ = add_cat_id(scraped, old)
    # Re-read so first-level rows inserted just now get their generated IDs.
    new = get_new_allname_and_cat_id()
    all_ = add_more_cat_id(all_, new)
    # Write second-level categories under their parents.
    over(all_)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment