Commit e0f0da64 by lichenggang

init

parents
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding" addBOMForNewFiles="with NO BOM" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (tornado_api_server) (2)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/update_cate.iml" filepath="$PROJECT_DIR$/.idea/update_cate.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</module>
\ No newline at end of file
No preview for this file type
# from utils.mysqlopera import MySqlOperator
# sqler = MySqlOperator('element14')
# with sqler.db.cursor() as cursor:
# sql = 'select cat_id,cat_name,url from lie_category where cat_name like "http%"'
# cursor.execute(sql)
# a=cursor.fetchall()
# for per in a:
# sql = 'UPDATE lie_category SET url=%s,cat_name=%s where cat_id=%s '
# data=(per[1],per[2],per[0])
# cursor.execute(sql,data)
# sqler.db.commit()
# # c=[]
# # for per in a:
# # c.append(per[0])
File mode changed
No preview for this file type
import re, requests
from lxml import etree
from utils.mysqlopera import MySqlOperator
# Database helper created for the 'chip1stop' name; get_level_1() uses
# operator.db for cursors and commits.
operator = MySqlOperator('chip1stop')
# Browser-like headers sent with the category-list request.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Host": "www.chip1stop.com",
    "Referer": "https://www.chip1stop.com/HKG/zh",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
# Site root; prepended to the href values extracted from the page.
domain = 'https://www.chip1stop.com'
# Page that is downloaded and parsed for the category tree.
catelist_url = 'https://www.chip1stop.com/HKG/zh/category'
# Leaf-category link text is expected to look like "<name>(<count>)".
# The previous name pattern contained an unescaped, unbalanced "(" and could
# not be compiled at all; the count pattern never matched a literal
# parenthesis. Both now match the bracket explicitly.
# NOTE(review): the site may render ASCII or full-width parentheses, so both
# forms are accepted -- confirm against the live page.
item_name_pattern = re.compile(r'([^(（]*)[(（]')
item_num_pattern = re.compile(r'[^(（]*[(（](\d*,*\d*,*\d+)[)）]')
# Template listing the lie_category columns and their default values; it is
# not read by the code in this file.
var = {
    'cat_name': None,
    'parent_id': None,
    'sort_order': 50,
    "is_show": 1,
    "url": None,
    "islast": None,
    "level": None,
    "page_count": 0
}
_INSERT_CATEGORY_SQL = (
    'INSERT into lie_category(cat_name,parent_id,sort_order,is_show,url,islast,level,page_count) values'
    '(%s,%s,%s,%s,%s,%s,%s,%s)'
)


def _insert_category(cat_name, parent_id, url, islast, level, fetch_id=True):
    """Insert one lie_category row and commit it.

    sort_order, is_show and page_count are always written as 50, 1 and 0.

    Returns:
        ``max(cat_id)`` read back after the commit when *fetch_id* is true
        (used as the id of the row just inserted), otherwise ``None``.
    """
    row = (cat_name, parent_id, 50, 1, url, islast, level, 0)
    with operator.db.cursor() as cursor:
        cursor.execute(_INSERT_CATEGORY_SQL, row)
        operator.db.commit()
        if not fetch_id:
            return None
        cursor.execute("select max(cat_id) from lie_category")
        return cursor.fetchone()[0]


def get_level_1():
    """Scrape the category page and store a three-level tree in lie_category.

    Downloads ``catelist_url``, walks every ``<section>`` of the category
    container and inserts one row per level-1, level-2 and level-3 category.
    Each child row's ``parent_id`` is the id read back after inserting its
    parent. Level-3 rows are stored with ``islast=1``.

    Raises:
        IndexError: if an expected heading or link is missing from the page.
        requests.RequestException: if the download fails or times out.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(catelist_url, headers=headers, timeout=30).text
    dom = etree.HTML(resp)
    level_doms = dom.xpath('//div[@class="clear box_t10 categoryanchorTop categoryanchorHeight"]//section')
    for se in level_doms:
        level_1_name = se.xpath('.//h3/a/text()')[0].strip()
        level_1_url = domain + se.xpath('.//h3/a/@href')[0]
        level_1_incby_id = _insert_category(level_1_name, 0, level_1_url, 0, 1)
        print('一级分类'+str(level_1_incby_id))

        for level_2dom in se.xpath('.//div[@class="categoryListDl clearfix"]'):
            level_2_name = level_2dom.xpath('.//h4//a')[0].xpath('string(.)').strip()
            level_2_url = domain + level_2dom.xpath('.//h4//a/@href')[0]
            level_2_incby_id = _insert_category(level_2_name, level_1_incby_id, level_2_url, 0, 2)
            print('二级分类' + str(level_2_incby_id))

            for level_3dom in level_2dom.xpath('.//ul//li'):
                link_text = level_3dom.xpath('.//a/text()')[0].strip()
                # The link text normally ends in "(<count>)"; fall back to the
                # whole text when no bracket is present instead of raising.
                names = item_name_pattern.findall(link_text)
                level_3_name = names[0].strip() if names else link_text
                level_3_url = domain + level_3dom.xpath('.//a/@href')[0]
                # Leaf rows have no children, so their id is not needed.
                _insert_category(level_3_name, level_2_incby_id, level_3_url, 1, 3, fetch_id=False)


if __name__ == '__main__':
    get_level_1()
import re, requests
from lxml import etree
from utils.mysqlopera import MySqlOperator
# Database helper created for the 'chip1stop' name; get_level_1() uses
# operator.db for cursors and commits.
operator = MySqlOperator('chip1stop')
# Browser-like headers sent with the stock-page request.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Host": "www.chip1stop.com",
    "Referer": "https://www.chip1stop.com/HKG/zh",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
# Site root; prepended to the href values extracted from the page.
domain = 'https://www.chip1stop.com'
# Page that is downloaded and parsed for the category tree.
catelist_url = 'https://www.chip1stop.com/HKG/zh/stock'
# Captures everything in front of the first "[" of a link text.
item_name_pattern = re.compile(r'([^\[]*)')
# Captures the digits-and-commas run enclosed in "[...]".
item_num_pattern = re.compile(r'\[([0-9,]*)\]')
# Column template with the default values of a lie_category row; nothing in
# this file reads it.
var = dict(
    cat_name=None,
    parent_id=None,
    sort_order=50,
    is_show=1,
    url=None,
    islast=None,
    level=None,
    page_count=0,
)
_INSERT_CATEGORY_SQL = (
    'INSERT into lie_category(cat_name,parent_id,sort_order,is_show,url,islast,level,page_count) values'
    '(%s,%s,%s,%s,%s,%s,%s,%s)'
)


def _insert_category(cat_name, parent_id, url, islast, level, fetch_id=True):
    """Insert one lie_category row and commit it.

    sort_order, is_show and page_count are always written as 50, 1 and 0.

    Returns:
        ``max(cat_id)`` read back after the commit when *fetch_id* is true
        (used as the id of the row just inserted), otherwise ``None``.
    """
    row = (cat_name, parent_id, 50, 1, url, islast, level, 0)
    with operator.db.cursor() as cursor:
        cursor.execute(_INSERT_CATEGORY_SQL, row)
        operator.db.commit()
        if not fetch_id:
            return None
        cursor.execute("select max(cat_id) from lie_category")
        return cursor.fetchone()[0]


def get_level_1():
    """Scrape the stock page and store a three-level tree in lie_category.

    Downloads ``catelist_url`` and walks every ``<section>`` of the active tab
    except the one with id ``a-product-01``. Each section heading becomes a
    level-1 row, each ``h4`` title a level-2 row, and every link in the first
    ``<ul>`` following that ``h4`` a level-3 row stored with ``islast=1``.
    Each child row's ``parent_id`` is the id read back after inserting its
    parent.

    Raises:
        IndexError: if an expected heading or link is missing from the page.
        requests.RequestException: if the download fails or times out.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(catelist_url, headers=headers, timeout=30).text
    dom = etree.HTML(resp)
    level_doms = dom.xpath('//div[@class="m-news-tab-content js-tab-target is-active"]//section[not(@id="a-product-01")]')
    for se in level_doms:
        level_1_name = se.xpath('.//h3/a/text()')[0].strip()
        level_1_url = domain + se.xpath('.//h3/a/@href')[0]
        level_1_incby_id = _insert_category(level_1_name, 0, level_1_url, 0, 1)
        print('一级分类:'+str(level_1_name))

        for level_2dom in se.xpath('.//h4[@class="m-bor-title m-text-16 m-mt"]'):
            level_2_name = level_2dom.xpath('.//a')[0].xpath('string(.)').strip()
            level_2_url = domain + level_2dom.xpath('.//a/@href')[0]
            level_2_incby_id = _insert_category(level_2_name, level_1_incby_id, level_2_url, 0, 2)
            print('二级分类:' + str(level_2_name))

            for level_3dom in level_2dom.xpath('./following-sibling::ul[1]//a'):
                link_text = level_3dom.xpath('./text()')[0].strip()
                # The pattern keeps everything before "[", including the
                # whitespace that separates the name from the bracket, so the
                # captured name is stripped before it is stored.
                level_3_name = item_name_pattern.findall(link_text)[0].strip()
                level_3_url = domain + level_3dom.xpath('./@href')[0]
                # Leaf rows have no children, so their id is not needed.
                _insert_category(level_3_name, level_2_incby_id, level_3_url, 1, 3, fetch_id=False)
                print('三级分类:' + level_3_name)


if __name__ == '__main__':
    get_level_1()
from utils.base import Module_Base
# Builds ``levels``: one dict per top-level category of the browse page, each
# carrying its sub-categories under 'level2_list'. Everything runs at import
# time because the result is exposed as a module attribute.
url = 'https://hk.element14.com/browse-for-products'
a = Module_Base()
headers = {
    "authority": "cn.element14.com",
    "method": "GET",
    "path": "/browse-for-products",
    "scheme": "https",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "max-age=0",
    "referer": "https://cn.element14.com/",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
}
resp = a.requests.session().get(url, headers=headers).text
doms = a.etree.HTML(resp)
cate_pagecount_pattern = a.re.compile('')
content_sections = doms.xpath('//div[@class="categoryContainer"]//section')

levels = []
for content_dom in content_sections:
    level1 = {
        'cat_name': content_dom.xpath('.//div[@class="catHeaderWrapper"]//h2//a/text()')[0].strip(),
        'url': content_dom.xpath('.//div[@class="catHeaderWrapper"]//h2//a/@href')[0].strip(),
        'level': 1,
        'islast': 0,
        'parent_id': 0,
        # One entry per <ul> inside the section's <nav>, taken from the first
        # link found in that list.
        'level2_list': [
            {
                'cat_name': nav_list.xpath('.//li//a/text()')[0],
                'url': nav_list.xpath('.//li//a/@href')[0],
                'level': 2,
                'islast': 1,
            }
            for nav_list in content_dom.xpath('.//nav//ul')
        ],
    }
    levels.append(level1)
File mode changed
#!encoding:utf-8
import requests
from lxml import etree
import re
from utils.mysqlopera import MySqlOperator
# Database helper created for the 'szlc' name; the functions below use
# operator.db for cursors and commits.
operator = MySqlOperator('szlc')
# Headers sent with the catalog request in get_szlc_all().
# NOTE(review): "authority"/"method"/"path"/"scheme" look like HTTP/2
# pseudo-header values copied from a browser's dev tools -- confirm that they
# are needed as ordinary request headers.
headers = {
    "authority": "www.szlcsc.com",
    "method": "GET",
    "path": "/catalog.html",
    "scheme": "https",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "max-age=0",
    "if-modified-since": "Wed, 13 Mar 2019 02:15:00 GMT",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
# Fetch every level-1 and level-2 category from the catalog page.
def get_szlc_all():
    """Download the szlcsc catalog page and return its two-level category tree.

    Returns:
        A list with one dict per level-1 category holding ``name``, ``num``
        (the integer parsed from the parentheses in the link text), ``url``,
        ``level`` (1), ``islast`` (0), ``parent_id`` (0) and ``2_list``.
        ``2_list`` holds one dict per child category with ``name``, ``num``,
        ``url``, ``level`` (2) and ``islast`` (1).

    Raises:
        IndexError: if a link text does not have the expected
            ``name (count)`` shape or an expected node is missing.
    """
    level_name_pattern = re.compile(r'([^\(]*) \(')
    level_num_pattern = re.compile(r'[^\(]*\((\d+)\)')
    level2_name_pattern = re.compile(r'([^\(]*)\(')
    url = 'https://www.szlcsc.com/catalog.html'
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    dom_html = etree.HTML(resp.text)

    # Named ``categories`` rather than ``all`` so the builtin is not shadowed.
    categories = []
    for dom in dom_html.xpath('//div[@class="item"]'):
        # The category title is the third text node of the item's direct link.
        title = dom.xpath('./a/text()')[2]
        print(title)
        per_1_cate = {
            'name': level_name_pattern.findall(title)[0].strip(),
            'num': int(level_num_pattern.findall(title)[0]),
            'url': dom.xpath('./a/@href')[0],
            'level': 1,
            'islast': 0,
            'parent_id': 0,
            '2_list': [],
        }
        for dom_cate_2 in dom.xpath('.//div[@class="child-item"]'):
            child_title = dom_cate_2.xpath('.//a/text()')[0]
            per_2_cate = {
                'name': level2_name_pattern.findall(child_title)[0].strip(),
                'num': int(level_num_pattern.findall(child_title)[0]),
                'url': dom_cate_2.xpath('.//a/@href')[0],
                # These two used to be written onto the parent dict, which
                # overwrote its level/islast with the child's values.
                'level': 2,
                'islast': 1,
            }
            per_1_cate['2_list'].append(per_2_cate)
        categories.append(per_1_cate)
    return categories
# Hide every category: set is_show to 0 on all rows.
def clean_db():
    """Set ``is_show`` to 0 on every lie_category row and commit."""
    hide_all_sql = "update lie_category set is_show = 0"
    with operator.db.cursor() as cur:
        cur.execute(hide_all_sql)
        operator.db.commit()
# Read the id and name of every category already stored.
def get_old_allname_and_cat_id():
    """Return all ``(cat_id, cat_name)`` rows currently in lie_category."""
    query = "select cat_id,cat_name from lie_category "
    with operator.db.cursor() as cur:
        cur.execute(query)
        rows = cur.fetchall()
    return rows
# Look up existing ids for level-1 and level-2 categories, then update or
# insert the level-1 rows.
def add_cat_id(all, old):
    """Attach known ids to the scraped tree and persist the level-1 rows.

    Args:
        all: list of level-1 category dicts, each with a ``2_list`` of child
            dicts (the structure returned by ``get_szlc_all``).
        old: ``(cat_id, cat_name)`` rows already stored in the table.

    A level-1 entry takes the id of the first row containing its name; a
    level-2 entry takes the id of the last such row. Level-1 entries with an
    id are updated in place, the others are inserted. The work is committed
    and the same list object is returned.
    """
    for top in all:
        first_hit = next((row for row in old if top['name'] in row), None)
        if first_hit is not None:
            top['cat_id'] = first_hit[0]
        for child in top['2_list']:
            # No early exit here: a later matching row overrides an earlier one.
            for row in old:
                if child['name'] in row:
                    child['cat_id'] = row[0]

    update_sql = 'UPDATE lie_category SET parent_id=%s,is_show=%s, url=%s,islast=%s,level=%s WHERE cat_id=%s'
    insert_sql = ('INSERT into lie_category(cat_name,parent_id,sort_order,is_show,url,islast,level,page_count) values'
                  '(%s,%s,%s,%s,%s,%s,%s,%s)')
    with operator.db.cursor() as cur:
        for top in all:
            if top.get('cat_id'):
                cur.execute(update_sql, (0, 1, top['url'], 0, 1, top['cat_id']))
            else:
                cur.execute(insert_sql, (top["name"], 0, 50, 1, top["url"], 0, 1, 0))
        operator.db.commit()
    return all
# Re-read ids and names so freshly inserted level-1 rows are included.
def get_new_allname_and_cat_id():
    """Return all ``(cat_id, cat_name)`` rows currently in lie_category."""
    query = "select cat_id,cat_name from lie_category "
    with operator.db.cursor() as cur:
        cur.execute(query)
        rows = cur.fetchall()
    return rows
# Attach ids to the level-1 categories.
def add_more_cat_id(all_, new):
    """Set ``cat_id`` on each level-1 entry from the first row containing its name.

    Args:
        all_: list of level-1 category dicts; modified in place.
        new: ``(cat_id, cat_name)`` rows to search.

    Returns:
        The same ``all_`` list. Entries without a matching row are left
        untouched.
    """
    for entry in all_:
        hit = next((row for row in new if entry['name'] in row), None)
        if hit is not None:
            entry['cat_id'] = hit[0]
    return all_
# Give each level-2 category its parent_id; insert the ones without an id.
def over(all_):
    """Persist every level-2 category under its level-1 parent and commit.

    Args:
        all_: list of level-1 category dicts that already carry ``cat_id``,
            each with a ``2_list`` of child dicts.

    Children that carry a ``cat_id`` are updated; the others are inserted with
    the parent's id as ``parent_id``.
    """
    update_sql = 'UPDATE lie_category SET parent_id=%s,is_show=%s, url=%s,islast=%s,level=%s WHERE cat_id=%s'
    insert_sql = ('INSERT into lie_category(cat_name,parent_id,sort_order,is_show,url,islast,level,page_count) values'
                  '(%s,%s,%s,%s,%s,%s,%s,%s)')
    with operator.db.cursor() as cur:
        for parent in all_:
            for child in parent['2_list']:
                if child.get('cat_id'):
                    statement = update_sql
                    params = (parent['cat_id'], 1, child['url'], 1, 2, child['cat_id'])
                else:
                    statement = insert_sql
                    params = (child["name"], parent['cat_id'], 50, 1, child["url"], 1, 2, 0)
                cur.execute(statement, params)
        operator.db.commit()
if __name__ == '__main__':
    # Scrape first: clean_db() commits is_show=0 for every row, so running it
    # before a scrape that then fails would leave all categories hidden.
    scraped = get_szlc_all()
    clean_db()
    old = get_old_allname_and_cat_id()
    all_ = add_cat_id(scraped, old)
    # Re-read the table so level-1 rows inserted just above get their ids.
    new = get_new_allname_and_cat_id()
    all_ = add_more_cat_id(all_, new)
    over(all_)
import sys
from updateopera import Updater
import importlib
if __name__ == '__main__':
    # The platform name selects the scraper module plat.<name>.cate_stock and
    # is also the name handed to Updater.
    if len(sys.argv) < 2:
        sys.exit('usage: python {} <plat_name>'.format(sys.argv[0]))
    plat_name = sys.argv[1]
    module_name = 'plat.{}.cate_stock'.format(plat_name)
    # Importing the module runs the scrape; the result is its ``levels`` list.
    module = importlib.import_module(module_name)
    updater = Updater(plat_name)
    updater.up(module.levels)
from utils.mysqlopera import MySqlOperator
class Updater():
    """Synchronise a scraped category tree into the lie_category table."""

    def __init__(self, db_name):
        """Create the MySqlOperator for *db_name* used by every method."""
        self.operator = MySqlOperator(db_name)

    def clean_db(self):
        """Set ``is_show`` to 0 on every row (not committed here)."""
        with self.operator.db.cursor() as cursor:
            sql = "update lie_category set is_show = 0"
            cursor.execute(sql)

    def get_cat_id(self, cat_name):
        """Return the ``cat_id`` of the first row named *cat_name*, or ``None``."""
        with self.operator.db.cursor() as cursor:
            sql = 'select cat_id from lie_category where cat_name =%s'
            # Parameters are passed as a one-element tuple, the form every
            # DB-API driver accepts.
            cursor.execute(sql, (cat_name,))
            result = cursor.fetchone()
        return result[0] if result else None

    def update(self, cat_id, url, islast, level, parent_id=0, is_show=1):
        """Update url, islast, level, is_show and parent_id of row *cat_id*."""
        with self.operator.db.cursor() as cursor:
            sql = 'UPDATE lie_category SET url=%s,islast=%s,level=%s,is_show=%s, parent_id=%s where cat_id=%s'
            # Order must follow the placeholders: parent_id belongs to the SET
            # clause and cat_id to the WHERE clause (they used to be swapped).
            data = (url, islast, level, is_show, parent_id, cat_id)
            cursor.execute(sql, data)

    def insert(self, cat_name, url, islast, level, parent_id=0, is_show=1, sort_order=50, page_count=1):
        """Insert a category row and return ``max(cat_id)`` read back afterwards."""
        with self.operator.db.cursor() as cursor:
            sql = 'INSERT into lie_category(cat_name,parent_id,sort_order,is_show,url,islast,level,page_count) values' \
                  '(%s,%s,%s,%s,%s,%s,%s,%s)'
            data = (cat_name, parent_id, sort_order, is_show, url, islast, level, page_count)
            cursor.execute(sql, data)
            cursor.execute("select max(cat_id) from lie_category")
            cat_id = cursor.fetchone()[0]
        return cat_id

    def up(self, levels):
        """Hide all rows, then update or insert every scraped category and commit.

        Args:
            levels: list of level-1 dicts with ``cat_name``, ``url``,
                ``islast``, ``level``, ``parent_id`` and an optional
                ``level2_list`` of child dicts with ``cat_name``, ``url``,
                ``islast`` and ``level``.

        Existing rows are matched by ``cat_name``. Only children whose
        ``islast`` is truthy are written; deeper levels are not handled yet.
        """
        self.clean_db()
        for level1 in levels:
            cat1_id = self.get_cat_id(level1['cat_name'])
            if cat1_id:
                self.update(cat1_id, level1['url'], level1['islast'], level1['level'])
            else:
                cat1_id = self.insert(level1['cat_name'], level1['url'], level1['islast'], level1['level'],
                                      level1['parent_id'])
            print('一级分类', level1['cat_name'])
            # A level-1 entry without children must not abort the whole run.
            for level2 in level1.get('level2_list') or []:
                if level2['islast']:
                    cat2_id = self.get_cat_id(level2['cat_name'])
                    if cat2_id:
                        self.update(cat2_id, level2['url'], level2['islast'], level2['level'], cat1_id)
                    else:
                        self.insert(level2['cat_name'], level2['url'], level2['islast'], level2['level'], cat1_id)
                    print('二级分类', level2['cat_name'])
                else:
                    pass  # TODO: add support for third-level categories
        self.operator.db.commit()
\ No newline at end of file
No preview for this file type
No preview for this file type
import re, requests,json
from lxml import etree
from utils.mysqlopera import MySqlOperator
class Module_Base():
    """Bundle of the modules imported above, exposed as class attributes.

    Code holding an instance can reach them as ``obj.re``, ``obj.requests``,
    ``obj.json``, ``obj.etree`` and ``obj.mysql_opera`` without importing
    each one itself.
    """
    re = re
    requests = requests
    json = json
    etree = etree
    # The MySqlOperator class itself, not an instance.
    mysql_opera = MySqlOperator
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment