Commit 7a5224a6 by lichenggang

update ti

parent 834ae550
...@@ -34,7 +34,7 @@ def get_levels(): ...@@ -34,7 +34,7 @@ def get_levels():
for level2_dom in level1_dom[1].xpath('.//li/a'): for level2_dom in level1_dom[1].xpath('.//li/a'):
level2 = {} level2 = {}
level2['cat_name'] = level2_dom.xpath('./text()')[0] level2['cat_name'] = level2_dom.xpath('./text()')[0]
level2['url'] = domain + level2_dom.xpath('./@href')[0] level2['url'] = level2_dom.xpath('./@href')[0]
level2['level'] = 2 level2['level'] = 2
level2['islast'] = 1 level2['islast'] = 1
level1['level2_list'].append(level2) level1['level2_list'].append(level2)
......
#!/usr/bin/env python
# -*- coding:utf-8 -*-
\ No newline at end of file
from utils.base import Module_Base
a = Module_Base()
from concurrent.futures import ThreadPoolExecutor,as_completed
import re
from lxml import etree
import requests
# Captures the quoted value assigned to `tiProductPathID` in a page's source.
exid_pat=re.compile('tiProductPathID = \"(.*)\"')
# Pool of 5 worker threads; not referenced by the code visible in this script.
exector = ThreadPoolExecutor(max_workers=5)
# Request headers carrying a desktop Chrome User-Agent string.
headers={
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
# Disabled snippet, kept for reference: it reads the product links from the
# ti.com.cn home page and prefixes them with "http:". Presumably the
# hard-coded `us` list below was generated by it -- TODO confirm.
# url="http://www.ti.com.cn/"
# resp=requests.get(url,headers=headers)
# dom=etree.HTML(resp.text)
# urls = dom.xpath('//div[@id="sub_products"]//div[@class="column"]//ul//a/@href')
# us = ['http:'+i for i in urls]
# print(us)
# Product overview pages; get_levels() turns each one into a level-1 category.
us=['http://www.ti.com.cn/zh-cn/amplifier-circuit/overview.html', 'http://www.ti.com.cn/zh-cn/audio-ic/overview.html', 'http://www.ti.com.cn/zh-cn/clock-and-timing/overview.html', 'http://www.ti.com.cn/zh-cn/data-converters/overview.html', 'http://www.ti.com.cn/zh-cn/die-wafer-services/overview.html', 'http://www.ti.com.cn/zh-cn/dlp-chip/overview.html', 'http://www.ti.com.cn/zh-cn/interface/overview.html', 'http://www.ti.com.cn/zh-cn/isolation/overview.html', 'http://www.ti.com.cn/zh-cn/logic-circuit/overview.html', 'http://www.ti.com.cn/zh-cn/microcontrollers/overview.html', 'http://www.ti.com.cn/zh-cn/motor-drivers/overview.html', 'http://www.ti.com.cn/zh-cn/power-management/overview.html', 'http://www.ti.com.cn/zh-cn/processors/overview.html', 'http://www.ti.com.cn/zh-cn/rf-microwave/overview.html', 'http://www.ti.com.cn/zh-cn/sensors/overview.html', 'http://www.ti.com.cn/zh-cn/switches-multiplexers/overview.html', 'http://www.ti.com.cn/zh-cn/wireless-connectivity/overview.html']
# Report how many overview pages are listed.
print(len(us))
def _fetch_text(page_url, timeout=30):
    """Download *page_url* with the shared headers and return the body text."""
    # A timeout keeps one stalled connection from hanging the whole crawl.
    return requests.get(page_url, headers=headers, timeout=timeout).text


def _clean_cat_name(raw_name):
    """Drop every "(digits)" marker and all spaces from a category label."""
    return re.sub(r'\(\d+\)', '', raw_name).strip().replace(' ', '')


def _extract_ext_id(page_text, page_url):
    """Return the integer id embedded in the page's ``tiProductPathID``.

    The id is the second-to-last ``/``-separated segment of the captured value.

    Raises:
        IndexError: if the page has no ``tiProductPathID`` assignment, or the
            captured value has fewer than two ``/``-separated segments.
        ValueError: if that segment is not an integer.
    """
    matches = exid_pat.findall(page_text)
    if not matches:
        # Same exception type as the bare ``[0]`` lookup, but naming the page.
        raise IndexError(
            'tiProductPathID not found in page: {}'.format(page_url))
    return int(matches[0].split('/')[-2])


def get_levels():
    """Crawl the TI overview pages in ``us`` and build the category tree.

    Returns:
        A list of level-1 category dicts. Each holds ``url``, ``level``,
        ``cat_name``, ``islast``, ``parent_id``, ``ext_fields`` and a
        ``level2_list`` of level-2 dicts; every level-2 dict in turn holds a
        ``level3_list`` of level-3 dicts.

    Raises:
        IndexError: if an expected element or ``tiProductPathID`` is missing.
        requests.RequestException: if a page cannot be downloaded.
    """
    levels = []
    for overview_url in us:
        level1 = dict()
        level1['url'] = overview_url
        level1['level'] = 1
        text = _fetch_text(overview_url)
        level1_dom = etree.HTML(text)
        level1['cat_name'] = level1_dom.xpath(
            '//h2[@data-lid="portalTitle"]/a/text()')[0]
        # NOTE(review): this XPath runs from the document root, so any <ul>
        # on the page marks the category as non-leaf -- confirm intent.
        level1['islast'] = 0 if level1_dom.xpath('.//ul') else 1
        level1['parent_id'] = 0
        level1['ext_fields'] = _extract_ext_id(text, overview_url)
        level1['level2_list'] = []

        for level2_dom in level1_dom.xpath('//li[@class="ti-nav-level1"]'):
            level2 = dict()
            level2['cat_name'] = _clean_cat_name(
                level2_dom.xpath('./a/text()')[0])
            level2['url'] = level2_dom.xpath('./a/@href')[0]
            level2['level'] = 2
            # A nested <ul> means the entry has level-3 children.
            level2['islast'] = 0 if level2_dom.xpath('.//ul') else 1
            level2['level3_list'] = []
            print('2级', level2['url'])
            if '数字电位器' in level2['cat_name']:
                # Hard-coded id; presumably this page has no parsable
                # tiProductPathID -- TODO confirm. No download is needed.
                level2['ext_fields'] = 1900
            else:
                level2['ext_fields'] = _extract_ext_id(
                    _fetch_text(level2['url']), level2['url'])

            for level3_dom in level2_dom.xpath('./ul//li'):
                level3 = {}
                level3['cat_name'] = _clean_cat_name(
                    level3_dom.xpath('./a/text()')[0])
                level3['url'] = level3_dom.xpath('./a/@href')[0]
                level3['level'] = 3
                level3['islast'] = 1
                print('3级', level3['url'])
                if 'Zigbee' in level3['cat_name']:
                    # Hard-coded id, same caveat as above -- TODO confirm.
                    level3['ext_fields'] = 1624
                else:
                    level3['ext_fields'] = _extract_ext_id(
                        _fetch_text(level3['url']), level3['url'])
                level2['level3_list'].append(level3)

            level1['level2_list'].append(level2)
        levels.append(level1)
    return levels
# Module-level driver: crawl the full category tree and print it.
levels = get_levels()
print(levels)
\ No newline at end of file
#!/usr/bin/env python
# -*- coding:utf-8 -*-
\ No newline at end of file
from utils.base import Module_Base
# Site root; get_levels() prepends it to the hrefs scraped from the pages.
domain='https://www.51dzw.com/'
# Request headers carrying a desktop Chrome User-Agent string.
headers={
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
}
# Project helper object; its `requests` and `etree` attributes are used below.
a = Module_Base()
# Fetch the home page and decode the response as GBK before parsing it.
url = 'https://www.51dzw.com/'
resp = a.requests.get(url,headers=headers)
resp.encoding='gbk'
dom = a.etree.HTML(resp.text)
# Anchors of the top-level categories, found in the div with class "mainLeft".
level1_doms=dom.xpath('//div[@class="mainLeft"]//ul/a')
def get_levels():
    """Build the two-level category tree from the 51dzw home page.

    Each anchor in ``level1_doms`` becomes a level-1 dict. Its category page
    is downloaded, decoded as GBK, and every anchor under the ``CateList``
    element becomes a level-2 dict in that entry's ``level2_list``.

    Returns:
        The list of level-1 category dicts.
    """
    tree = []
    for top_anchor in level1_doms:
        top_entry = {
            'cat_name': top_anchor.xpath('./text()')[0].strip(),
            'url': domain + top_anchor.xpath('./@href')[0],
            'level': 1,
            'islast': 0,
            'parent_id': 0,
            'level2_list': [],
        }

        page = a.requests.get(top_entry['url'], headers=headers)
        page.encoding = 'gbk'
        sub_anchors = a.etree.HTML(page.text).xpath(
            '//dd[@id="CateList"]//span/a')
        # Printed before the children are attached, so the list shows empty.
        print('一级分类', top_entry)

        top_entry['level2_list'].extend(
            {
                'cat_name': sub_anchor.xpath('./text()')[0].strip(),
                'url': domain + sub_anchor.xpath('./@href')[0],
                'level': 2,
                'islast': 1,
            }
            for sub_anchor in sub_anchors
        )
        tree.append(top_entry)
    return tree
# Module-level driver: crawl the category tree and print it.
levels = get_levels()
print(levels)
\ No newline at end of file
...@@ -19,18 +19,18 @@ class Updater(): ...@@ -19,18 +19,18 @@ class Updater():
result = result[0] if result else None result = result[0] if result else None
return result return result
def update(self, cat_id, url, islast, level, parent_id=0, is_show=1): def update(self, cat_id, url, islast, level, parent_id=0, is_show=1, ext_fields=''):
with self.operator.db.cursor() as cursor: with self.operator.db.cursor() as cursor:
sql = 'UPDATE {} SET url=%s,islast=%s,level=%s,is_show=%s, parent_id=%s where cat_id=%s'.format( sql = 'UPDATE {} SET url=%s,islast=%s,level=%s,is_show=%s, parent_id=%s, ext_fields=%s where cat_id=%s'.format(
self.table_name) self.table_name)
data = (url, islast, level, is_show, parent_id, cat_id) data = (url, islast, level, is_show, parent_id, ext_fields, cat_id)
cursor.execute(sql, data) cursor.execute(sql, data)
def insert(self, cat_name, url, islast, level, parent_id=0, is_show=1, sort_order=50, page_count=1): def insert(self, cat_name, url, islast, level, parent_id=0, is_show=1, sort_order=50, page_count=1, ext_fields=''):
with self.operator.db.cursor() as cursor: with self.operator.db.cursor() as cursor:
sql = 'INSERT into {}(cat_name,parent_id,sort_order,is_show,url,islast,level,page_count) values' \ sql = 'INSERT into {}(cat_name,parent_id,sort_order,is_show,url,islast,level,page_count,ext_fields) values' \
'(%s,%s,%s,%s,%s,%s,%s,%s)'.format(self.table_name) '(%s,%s,%s,%s,%s,%s,%s,%s,%s)'.format(self.table_name)
data = (cat_name, parent_id, sort_order, is_show, url, islast, level, page_count) data = (cat_name, parent_id, sort_order, is_show, url, islast, level, page_count,ext_fields)
cursor.execute(sql, data) cursor.execute(sql, data)
cursor.execute("select max(cat_id) from {}".format(self.table_name)) cursor.execute("select max(cat_id) from {}".format(self.table_name))
cat_id = cursor.fetchone()[0] cat_id = cursor.fetchone()[0]
...@@ -41,22 +41,26 @@ class Updater(): ...@@ -41,22 +41,26 @@ class Updater():
for level1 in levels: for level1 in levels:
cat1_id = self.get_cat_id(level1['cat_name']) cat1_id = self.get_cat_id(level1['cat_name'])
if cat1_id: if cat1_id:
self.update(cat1_id, level1['url'], level1['islast'], level1['level']) self.update(cat1_id, level1['url'], level1['islast'], level1['level'],ext_fields=level1['ext_fields'])
else: else:
cat1_id = self.insert(level1['cat_name'], level1['url'], level1['islast'], level1['level'], cat1_id = self.insert(level1['cat_name'], level1['url'], level1['islast'], level1['level'],
level1['parent_id']) level1['parent_id'],ext_fields=level1['ext_fields'])
print('一级分类', level1['cat_name']) print('一级分类', level1['cat_name'])
for level2 in level1.get('level2_list'): for level2 in level1.get('level2_list'):
if level2['islast']:
cat2_id = self.get_cat_id(level2['cat_name']) cat2_id = self.get_cat_id(level2['cat_name'])
if cat2_id: if cat2_id:
self.update(cat2_id, level2['url'], level2['islast'], level2['level'], cat1_id) self.update(cat2_id, level2['url'], level2['islast'], level2['level'], cat1_id,ext_fields=level2['ext_fields'])
else: else:
self.insert(level2['cat_name'], level2['url'], level2['islast'], level2['level'], cat1_id) cat2_id = self.insert(level2['cat_name'], level2['url'], level2['islast'], level2['level'], cat1_id,ext_fields=level2['ext_fields'])
print('二级分类', level2['cat_name']) print('二级分类', level2['cat_name'])
if not level2['islast']:
for level3 in level2.get('level3_list'):
cat3_id = self.get_cat_id(level3['cat_name'])
if cat3_id:
self.update(cat3_id, level3['url'], level3['islast'], level3['level'], cat2_id,ext_fields=level3['ext_fields'])
else: else:
pass # TODO 增加三级分类 self.insert(level3['cat_name'], level3['url'], level3['islast'], level3['level'], cat2_id,ext_fields=level3['ext_fields'])
print('三级分类', level3['cat_name'])
self.operator.db.commit() self.operator.db.commit()
def insert_once(self, cat_name, url, islast, level, parent_id=0, page_count=1): def insert_once(self, cat_name, url, islast, level, parent_id=0, page_count=1):
......
...@@ -26,7 +26,8 @@ HOST_SET = { ...@@ -26,7 +26,8 @@ HOST_SET = {
# 'test_szlc': '' # 'test_szlc': ''
} }
UP_SET = { UP_SET = {
'test': ('ichunt', 'ichunt'), # 'test': ('ichunt', 'ichunt'),
'test': ('bigdata', 'bigdata2019'),
'produce': ('bigdata', 'bdYm2yy2mmyzlmlly'), 'produce': ('bigdata', 'bdYm2yy2mmyzlmlly'),
'test_szlc': ('szlc', 'szlc#zsyM') 'test_szlc': ('szlc', 'szlc#zsyM')
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment