Commit d77d64f4 by lichenggang

Initial commit

parents
No preview for this file type
No preview for this file type
No preview for this file type
import re, requests
from lxml import etree
from utils.mysqlopera import MySqlOperator
# Shared database helper, constructed with the name 'chip1stop'.
# NOTE(review): presumably the name selects the target database/config inside
# utils.mysqlopera - confirm there.
operator = MySqlOperator('chip1stop')
# Browser-like request headers sent with the category page request.
# NOTE(review): "br" is advertised in Accept-Encoding; confirm the HTTP client
# in use can actually decode Brotli responses, otherwise `.text` may be garbage.
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "max-age=0",
"Connection": "keep-alive",
"Host": "www.chip1stop.com",
"Referer": "https://www.chip1stop.com/CHN/zh",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
# Site root, prepended to the relative hrefs scraped from the page.
domain = 'https://www.chip1stop.com'
# Page that lists the whole category tree.
catelist_url = 'https://www.chip1stop.com/CHN/zh/category'
# Leaf links are labelled "<name>(<count>)". The previous patterns used bare
# parentheses: the name pattern did not even compile (unterminated group) and
# the count pattern treated the brackets as regex groups. Both now match a
# literal opening/closing parenthesis, ASCII or full-width.
# Group 1 of item_name_pattern is the text before the opening parenthesis.
item_name_pattern = re.compile(r'([^(（]*)[(（]')
# Group 1 of item_num_pattern is the count, thousands separators included.
item_num_pattern = re.compile(r'[^(（]*[(（](\d*,*\d*,*\d+)[)）]')
# Template of the lie_category columns with their default values.
var = {
    'cat_name': None,
    'parent_id': None,
    'sort_order': 50,
    "is_show": 1,
    "url": None,
    "islast": None,
    "level": None,
    "page_count": 0
}
_CATEGORY_INSERT_SQL = (
    'INSERT into lie_category(cat_name,parent_id,sort_order,is_show,url,islast,level,page_count) values'
    '(%s,%s,%s,%s,%s,%s,%s,%s)'
)


def _insert_category(cat_name, url, parent_id, level, islast):
    """Insert one lie_category row and return the id generated for it.

    The id is read from ``cursor.lastrowid`` of the inserting cursor rather
    than from ``select max(cat_id)``, which can return another writer's row.
    Assumes cat_id is the table's AUTO_INCREMENT key - TODO confirm.
    """
    # sort_order=50, is_show=1 and page_count=0 are the fixed defaults.
    row = (cat_name, parent_id, 50, 1, url, islast, level, 0)
    with operator.db.cursor() as cursor:
        cursor.execute(_CATEGORY_INSERT_SQL, row)
        new_id = cursor.lastrowid
        operator.db.commit()
    return new_id


def _strip_item_count(label):
    """Return *label* without its trailing "(count)" part, whitespace-trimmed."""
    match = item_name_pattern.match(label)
    # Labels without a count are kept whole instead of raising IndexError.
    return (match.group(1) if match else label).strip()


def get_level_1():
    """Crawl the category page and store the three-level tree in lie_category.

    Level-1 rows get parent_id 0; level-2 and level-3 rows point at the id of
    the row inserted for their parent. Only level-3 rows are marked islast=1.
    """
    # A timeout keeps the crawler from hanging forever on a stalled connection.
    resp = requests.get(catelist_url, headers=headers, timeout=30).text
    dom = etree.HTML(resp)
    level_doms = dom.xpath('//div[@class="clear box_t10 categoryanchorTop categoryanchorHeight"]//section')
    for se in level_doms:
        level_1_name = se.xpath('.//h3/a/text()')[0].strip()
        level_1_url = domain + se.xpath('.//h3/a/@href')[0]
        level_1_id = _insert_category(level_1_name, level_1_url, parent_id=0, level=1, islast=0)
        print('一级分类' + str(level_1_id))
        for level_2dom in se.xpath('.//div[@class="categoryListDl clearfix"]'):
            level_2_name = level_2dom.xpath('.//h4//a')[0].xpath('string(.)').strip()
            level_2_url = domain + level_2dom.xpath('.//h4//a/@href')[0]
            level_2_id = _insert_category(level_2_name, level_2_url, parent_id=level_1_id, level=2, islast=0)
            print('二级分类' + str(level_2_id))
            for level_3dom in level_2dom.xpath('.//ul//li'):
                level_3_name = _strip_item_count(level_3dom.xpath('.//a/text()')[0].strip())
                level_3_url = domain + level_3dom.xpath('.//a/@href')[0]
                _insert_category(level_3_name, level_3_url, parent_id=level_2_id, level=3, islast=1)


if __name__ == '__main__':
    get_level_1()
import re, requests
from lxml import etree
from utils.mysqlopera import MySqlOperator
# Shared database helper, constructed with the name 'chip1stop'.
# NOTE(review): presumably the name selects the target database/config inside
# utils.mysqlopera - confirm there.
operator = MySqlOperator('chip1stop')
# Browser-like request headers sent with the stock page request.
# NOTE(review): "br" is advertised in Accept-Encoding; confirm the HTTP client
# in use can actually decode Brotli responses, otherwise `.text` may be garbage.
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "max-age=0",
"Connection": "keep-alive",
"Host": "www.chip1stop.com",
"Referer": "https://www.chip1stop.com/CHN/zh",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
# Site root, prepended to the relative hrefs scraped from the page.
domain = 'https://www.chip1stop.com'
# Page whose markup carries the category tree crawled by this script.
catelist_url = domain + '/CHN/zh/stock'
# Leaf links are labelled "<name>[<count>]".
# Group 1 is everything before the first "[".
item_name_pattern = re.compile(r'([^\[]*)')
# Group 1 is the bracketed count, thousands separators included.
item_num_pattern = re.compile(r'\[([0-9,]*)\]')
# Template of the lie_category columns with their default values.
var = dict(
    cat_name=None,
    parent_id=None,
    sort_order=50,
    is_show=1,
    url=None,
    islast=None,
    level=None,
    page_count=0,
)
_CATEGORY_INSERT_SQL = (
    'INSERT into lie_category(cat_name,parent_id,sort_order,is_show,url,islast,level,page_count) values'
    '(%s,%s,%s,%s,%s,%s,%s,%s)'
)


def _insert_category(cat_name, url, parent_id, level, islast):
    """Insert one lie_category row and return the id generated for it.

    The id is read from ``cursor.lastrowid`` of the inserting cursor rather
    than from ``select max(cat_id)``, which can return another writer's row.
    Assumes cat_id is the table's AUTO_INCREMENT key - TODO confirm.
    """
    # sort_order=50, is_show=1 and page_count=0 are the fixed defaults.
    row = (cat_name, parent_id, 50, 1, url, islast, level, 0)
    with operator.db.cursor() as cursor:
        cursor.execute(_CATEGORY_INSERT_SQL, row)
        new_id = cursor.lastrowid
        operator.db.commit()
    return new_id


def get_level_1():
    """Crawl the stock page and store the three-level tree in lie_category.

    Level-1 rows get parent_id 0; level-2 and level-3 rows point at the id of
    the row inserted for their parent. Only level-3 rows are marked islast=1.
    """
    # A timeout keeps the crawler from hanging forever on a stalled connection.
    resp = requests.get(catelist_url, headers=headers, timeout=30).text
    dom = etree.HTML(resp)
    # The section with id "a-product-01" is excluded from the crawl.
    level_doms = dom.xpath('//div[@class="m-news-tab-content js-tab-target is-active"]//section[not(@id="a-product-01")]')
    for se in level_doms:
        level_1_name = se.xpath('.//h3/a/text()')[0].strip()
        level_1_url = domain + se.xpath('.//h3/a/@href')[0]
        level_1_id = _insert_category(level_1_name, level_1_url, parent_id=0, level=1, islast=0)
        print('一级分类:' + str(level_1_name))
        for level_2dom in se.xpath('.//h4[@class="m-bor-title m-text-16 m-mt"]'):
            level_2_name = level_2dom.xpath('.//a')[0].xpath('string(.)').strip()
            level_2_url = domain + level_2dom.xpath('.//a/@href')[0]
            level_2_id = _insert_category(level_2_name, level_2_url, parent_id=level_1_id, level=2, islast=0)
            print('二级分类:' + str(level_2_name))
            # Level-3 links live in the first <ul> that follows the <h4>.
            for level_3dom in level_2dom.xpath('./following-sibling::ul[1]//a'):
                label = level_3dom.xpath('./text()')[0].strip()
                # Keep only the text before the "[count]" suffix.
                level_3_name = item_name_pattern.match(label).group(1)
                level_3_url = domain + level_3dom.xpath('./@href')[0]
                _insert_category(level_3_name, level_3_url, parent_id=level_2_id, level=3, islast=1)
                print('三级分类:' + level_3_name)


if __name__ == '__main__':
    get_level_1()
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import sys
def get_env():
    """Return 'test' on macOS/Windows hosts and 'produce' on any other platform."""
    return 'test' if sys.platform in ('darwin', 'win32') else 'produce'
# Deployment environment, resolved once at import time ('test' or 'produce').
ENV = get_env()
# Short platform codes mapped to supplier names.
PLATFORM = {
'ar': 'arrow',
'dg': 'digikey'
}
# Every *_HOST_SET below maps an environment name to a host address.
# MySQL host (read by get_mysql_conf).
HOST_SET = {
'test': '192.168.1.232',
'produce': '172.18.137.37'
}
# Message-queue host (read by get_mq_conf).
MQ_HOST_SET = {
'test': '192.168.1.237',
'produce': '119.23.79.136'
}
# MongoDB host (read by get_mongo_conf).
MG_HOST_SET = {
'test': '192.168.1.237',
'produce': '172.18.137.23'
}
# Redis host used by get_er_redis_conf.
ER_HOST_SET = {
'test': '192.168.1.235',
'produce': '172.18.137.38'
}
# MySQL (user, password) pairs per environment.
# NOTE(review): credentials are committed in source; consider loading them from
# the environment or a secrets store and rotating the exposed values.
UP_SET = {
'test': ('ichunt', 'ichunt'),
'produce': ('bigdata', 'bdYm2yy2mmyzlmlly')
}
def get_mysql_conf(db):
    """Build the MySQL connection settings for database *db* in the current ENV.

    Returns a dict with host, port, user, password, db and charset keys.
    """
    mysql_host = HOST_SET[ENV]
    user, password = UP_SET[ENV]
    return {
        'host': mysql_host,
        'port': 3306,
        'user': user,
        'password': password,
        'db': db,
        'charset': 'utf8',
    }
def get_redis_conf():
    """Return the settings for the local Redis; a password is added only in 'produce'."""
    settings = dict(host='localhost', port=6379, db=0)
    if ENV != 'produce':
        return settings
    settings['password'] = 'icDb29mLy2s'
    return settings
def get_er_redis_conf():
    """Return the settings for the Redis instance on the ER_HOST_SET host of the current ENV."""
    return dict(
        host=ER_HOST_SET[ENV],
        port=6379,
        db=0,
        password='icDb29mLy2s',
    )
def get_mq_conf():
    """Return host, user and password of the message queue for the current ENV."""
    return dict(
        host=MQ_HOST_SET[ENV],
        user='huntadmin',
        password='jy2y2900',
    )
def get_mongo_conf():
    """Return the MongoDB connection settings for the current ENV."""
    mongo_host = MG_HOST_SET[ENV]
    return dict(
        host=mongo_host,
        port=27017,
        database='ichunt',
        user='ichunt',
        password='huntmon66499',
    )
# Groups of host address prefixes keyed by a short group name; every entry ends
# with ':' so a port can be appended by the consumer.
# NOTE(review): the meaning of the group names is not visible here - confirm with the users of this table.
all_hosts = {'ali_ml': ('60.205.217.219:', '47.100.98.66:', '119.23.204.20:', '119.23.230.246:'),
'ali_hk': ('47.244.26.18:',),
'ukd_ml': ('106.75.177.159:', '106.75.136.234:', '106.75.222.94:', '106.75.73.84:'),
'ukd_hk': ('103.210.21.90:',)}
This diff is collapsed. Click to expand it.
import json
import re
import time
from lxml import etree
from selenium import webdriver
from multiprocessing import Pool
import redis, os, traceback
from selenium.webdriver import DesiredCapabilities
from chip1stop.config import get_redis_conf, ENV
from chip1stop.mylogger import Mylogger
from chip1stop.rabbit_conn import producer
# Chrome options used when ENV == 'test': image loading is switched off.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('blink-settings=imagesEnabled=false')
# chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--disable-images')
# Site root, prepended to relative hrefs found in result rows.
domain = 'https://www.chip1stop.com'
# Captures everything after "partId=" in a product URL.
partid_pattern = re.compile('partId=(.*)')
# One WebDriver per worker process, keyed by os.getpid().
pid_driver_map = dict()
logger = Mylogger('error.log').logger
# PhantomJS capabilities used outside the 'test' environment.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0") # set the User-Agent request header
dcap["phantomjs.page.settings.loadImages"] = False # do not load images
def _get_process_driver():
    """Return this process's WebDriver, creating and caching it on first use."""
    pid = os.getpid()
    driver = pid_driver_map.get(pid)
    if driver is None:
        if ENV == 'test':
            driver = webdriver.Chrome(options=chrome_options, executable_path='C:/Users/ICHUNT/Desktop/chromedriver.exe')
        else:
            driver = webdriver.PhantomJS(desired_capabilities=dcap)
        pid_driver_map[pid] = driver
    return driver


def _log_page_error(page, url):
    """Log the last line of the active traceback together with page and url."""
    err = traceback.format_exc()
    logger.error(err[err.rfind('\n', 0, -2):].strip() + ' - page: ' + str(page) + ' - ' + 'url: ' + url)


def gen_table_doms(url, cat_id):
    """Scrape every result page of one category URL and publish the goods.

    Each page is either a single-product detail page (parsed by parse_text1)
    or a result table (parsed by parse_text). Every product dict is tagged
    with *cat_id* and sent to the queue as JSON. Errors on a page are logged
    and the crawl moves on to the next page.

    :param url: category URL to open.
    :param cat_id: category id attached to every product found.
    """
    driver = _get_process_driver()
    # The driver is reused across URLs. If the first load fails the browser
    # still shows the previous category, and paging through it would publish
    # that category's goods under this cat_id - so give up on this URL.
    try:
        driver.get(url)
    except Exception:
        _log_page_error(1, url)
        return
    time.sleep(5)
    for page in range(1, 10000):
        try:
            if page > 1:
                next_btn = driver.find_elements_by_xpath('//a[@id="searchFormForDisp:j_idt1211:nextPage"]')
                if not next_btn:
                    break
                next_btn[0].click()
                time.sleep(5)
            text = driver.page_source
            if 'm-product-detail-spec' in text:
                # Detail page: URL-derived fields come from the browser.
                prod = parse_text1(text)
                prod['site_url'] = driver.current_url
                prod['goods_name_style'] = partid_pattern.findall(driver.current_url)[0]
                prod['goods_sn'] = prod['goods_name'] + '€€' + prod['goods_name_style']
                prod['cat_id'] = cat_id
                producer.send_task(json.dumps(prod))
                print(prod)
            else:
                for good in parse_text(text):
                    good['cat_id'] = cat_id
                    producer.send_task(json.dumps(good))
                    print(good)
        except Exception:
            # Best effort: one broken page must not stop the remaining pages.
            _log_page_error(page, url)
def _as_quantity(values, default=1):
    """Return the first xpath value as an int, or *default* if absent or not numeric."""
    if not values:
        return default
    try:
        return int(values[0].replace(',', '').strip())
    except ValueError:
        return default


def parse_text(text):
    """Parse a search-result page into a list of product dicts.

    :param text: HTML source of the result page.
    :return: one dict per table row with name, URLs, supplier, stock,
        minimum order quantity, increment and price tiers.
    """
    html_dom = etree.HTML(text)
    rows = html_dom.xpath('//span[contains(@id,"searchFormForDisp:searchRepeat:")]/table/tbody/tr')
    product_list = []
    for tr in rows:
        obj = {}
        obj['goods_name'] = tr.xpath('.//span[@class="ellipsis-title"]/text()')[0]
        obj['site_url'] = domain + tr.xpath('.//h3/a/@href')[0]
        obj['goods_name_style'] = partid_pattern.findall(obj['site_url'])[0]
        obj['goods_sn'] = obj['goods_name'] + '€€' + obj['goods_name_style']
        obj['provider_name'] = tr.xpath('./td[1]//ul/li[1]/a/span/text()')[0]
        provider_url = tr.xpath('./td[1]//ul/li[1]/a/@href')[0]
        # Only site-relative links need the domain prefix.
        obj['provider_url'] = domain + provider_url if provider_url.startswith('/') else provider_url
        obj['attributes'] = []
        obj['goods_img'] = obj['goods_desc'] = obj['pdf_url'] = obj['goods_thumb'] = ''
        brief = tr.xpath('.//div[@class="m-product-tbl-main-col__caption"]/p/span')
        obj['goods_brief'] = brief[0].text if brief else ''
        fm = tr.xpath('./td[2]/div/div/p[2]')
        obj['from'] = fm[0].text.strip() if fm else ''
        stock = tr.xpath('.//p[@class="m-font-b"][2]')
        obj['goods_number'] = int(stock[0].text.replace('库存数:', '').replace(',', '')) if stock else 0
        # Always ints, matching parse_text1 (previously the raw attribute
        # string when present and the int 1 when absent).
        obj['min_buynum'] = _as_quantity(tr.xpath('.//td[3]//input[2]/@value'))
        obj['increment'] = _as_quantity(tr.xpath('.//td[3]//input[3]/@value'))
        obj['prices'] = []
        for price_dom in tr.xpath('.//td[4]//tbody//tr'):
            cells = price_dom.xpath('.//td')
            obj['prices'].append({
                'purchases': int(cells[0].xpath('./text()')[0].replace('+', '').replace(',', '')),
                'price': float(cells[1].xpath('./text()')[0].replace('¥', '').replace(',', '').strip()),
            })
        product_list.append(obj)
    return product_list
def parse_text1(text):
    """Parse a single-product detail page into one product dict.

    :param text: HTML source of the detail page.
    :return: dict with name, supplier, stock, minimum order quantity,
        increment and price tiers. URL-derived fields are not set here.
    """
    html_dom = etree.HTML(text)
    obj = dict()
    obj['goods_name'] = html_dom.xpath('//h3[@class="m-text-24 m-font-b"]/text()')[0].strip()
    obj['provider_name'] = html_dom.xpath('//ul[@class="m-text-16"]//li[1]//a[1]/text()')[0]
    provider_url = html_dom.xpath('//ul[@class="m-text-16"]//li[1]//a[1]/@href')[0]
    # Only site-relative links need the domain prefix.
    obj['provider_url'] = domain + provider_url if provider_url.startswith('/') else provider_url
    obj['attributes'] = []
    obj['goods_img'] = obj['goods_desc'] = obj['pdf_url'] = obj['goods_thumb'] = ''
    brief = html_dom.xpath('//div[@class="m-product-detail-spec__r"]//ul[@class="m-text-14"][1]//li[1]/text()')
    obj['goods_brief'] = brief[0].replace('产品概述:', '').strip() if brief else ''
    prod_from = html_dom.xpath('//div[@class="m-flex__item"]//p[@class="m-text-13"]/text()')
    obj['from'] = prod_from[0].strip() if prod_from else ''
    stock = html_dom.xpath('//input[@name="zaikoSuu"]/@value')
    obj['goods_number'] = int(stock[0].replace(",", "")) if stock else 0
    moq_spq = html_dom.xpath(
        '//div[@class="m-mt-xs m-flex m-flex--c2 m-flex--g20 m-text-14"]//div[@class="m-flex__item"][1]//ul//li[1]/text()')
    if moq_spq:
        # "MOQ/SPQ:<moq>/<spq>"; thousands separators are dropped like they
        # are for the stock figure, so "1,000/1,000" no longer raises.
        parts = [part.replace(',', '') for part in moq_spq[0].replace('MOQ/SPQ:', '').split('/')]
        obj['min_buynum'] = int(parts[0])
        obj['increment'] = int(parts[1])
    else:
        obj['min_buynum'] = 1
        obj['increment'] = 1
    obj['prices'] = []
    for price_dom in html_dom.xpath('//table[@class="m-m-product-tbl-cart-tbl"]//tbody//tr'):
        cells = price_dom.xpath('.//td')
        obj['prices'].append({
            'purchases': int(cells[0].xpath('./text()')[0].replace('+', '').replace(',', '')),
            'price': float(cells[1].xpath('./text()')[0].replace('¥', '').replace(',', '').strip()),
        })
    return obj
def get_url():
    """Yield up to 350 (url, cat_id) pairs popped from the Redis task list.

    Stops early as soon as the list 'csc_elec_chip1stop_1' is empty.
    """
    client = redis.Redis(connection_pool=redis.ConnectionPool(**get_redis_conf()))
    remaining = 350
    while remaining > 0:
        remaining -= 1
        raw_task = client.lpop('csc_elec_chip1stop_1')
        if not raw_task:
            break
        task = json.loads(raw_task.decode())
        yield (task['req_key'], task['extra']['cat_id'])
if __name__ == '__main__':
    # Fan the queued category URLs out over three worker processes and wait
    # until all of them have finished.
    p = Pool(3)
    for url, cat_id in get_url():
        p.apply_async(gen_table_doms, args=(url, cat_id))
    p.close()
    p.join()
import logging
from logging import handlers
class Mylogger(object):
    """Named logger that writes to stderr and to a time-rotated log file.

    The underlying logger is looked up by *filename*, so several Mylogger
    objects created for the same file share one logging.Logger.
    """

    # Maps the level names accepted by __init__ to logging constants.
    level_relations = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'crit': logging.CRITICAL
    }

    def __init__(self, filename, level='info', when='D', backCount=3, fmt='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'):
        """Configure the logger named *filename*.

        :param filename: log file path; also used as the logger name.
        :param level: key of level_relations; an unknown key makes
            setLevel() raise TypeError.
        :param when: rotation interval unit for TimedRotatingFileHandler.
        :param backCount: number of rotated files to keep.
        :param fmt: format string for both handlers.
        """
        self.logger = logging.getLogger(filename)
        self.logger.setLevel(self.level_relations.get(level))
        # getLogger() returns the same object for the same name; attaching
        # handlers again would write every record once per instantiation.
        if self.logger.handlers:
            return
        format_str = logging.Formatter(fmt)
        sh = logging.StreamHandler()
        sh.setFormatter(format_str)
        th = handlers.TimedRotatingFileHandler(filename=filename, when=when, backupCount=backCount, encoding='utf-8')
        th.setFormatter(format_str)
        self.logger.addHandler(sh)
        self.logger.addHandler(th)
import json
import pika
from chip1stop.config import get_mq_conf
class Base:
    """Blocking RabbitMQ connection with one channel and an optional exchange."""

    def __init__(self, user, pwd, host, exchange=None, exchange_type=None):
        """Connect to *host* with plain credentials and open a channel.

        The exchange is declared only when both *exchange* and
        *exchange_type* are given.
        """
        params = pika.ConnectionParameters(host, credentials=pika.PlainCredentials(user, pwd))
        self.conn = pika.BlockingConnection(params)  # connection
        self.ch = self.conn.channel()  # channel
        self.exchange = exchange
        if exchange and exchange_type:
            self.ch.exchange_declare(exchange=exchange, exchange_type=exchange_type)

    def send_task_fanout(self, body):
        """Publish *body* to the exchange with an empty routing key; no-op without an exchange."""
        if not self.exchange:
            return
        persistent = pika.BasicProperties(delivery_mode=2)
        self.ch.basic_publish(exchange=self.exchange, routing_key='',
                              properties=persistent,
                              body=body)

    def send_task_direct(self, routing_key, body):
        """Publish *body* to the exchange under *routing_key*; no-op without an exchange."""
        if not self.exchange:
            return
        persistent = pika.BasicProperties(delivery_mode=2)
        self.ch.basic_publish(exchange=self.exchange, routing_key=routing_key,
                              properties=persistent,
                              body=body)

    def __del__(self):
        """Close channel and connection on a best-effort basis."""
        try:
            self.ch.close()
            self.conn.close()
        except Exception:
            # Cleanup errors are ignored on purpose.
            pass
class Producer(Base):
    """Publisher that sends persistent messages to one named queue."""

    def __init__(self, data_queue=None, durable=True, exchange=None, exchange_type=None):
        """Connect using get_mq_conf() and declare *data_queue* when given.

        :param data_queue: queue name used as routing key by send_task.
        :param durable: whether the queue is declared durable.
        :param exchange: optional exchange name, handled by Base.
        :param exchange_type: optional exchange type, handled by Base.
        """
        config = get_mq_conf()
        super().__init__(config['user'], config['password'],
                         config['host'], exchange=exchange, exchange_type=exchange_type)
        self.data_queue = data_queue
        self.durable = durable  # queue durability
        if self.data_queue is not None:
            self.ch.queue_declare(queue=self.data_queue, durable=self.durable)

    def send_task(self, body):
        """Publish *body* to data_queue through the default exchange."""
        self.ch.basic_publish(exchange='', routing_key=self.data_queue,
                              properties=pika.BasicProperties(delivery_mode=2),  # make message persistent
                              body=body)

    def re_conn_channel(self):
        """Replace the current channel with a fresh one on the same connection."""
        # This is typically called after the channel broke; closing an
        # already-closed channel raises, so only close one that is still open.
        if self.ch.is_open:
            self.ch.close()
        self.ch = self.conn.channel()
        # Mirror __init__: there is nothing to declare without a queue name.
        if self.data_queue is not None:
            self.ch.queue_declare(queue=self.data_queue, durable=self.durable)
# Module-level publisher for the 'chip1stop_new_goods_store' queue; the
# connection is opened when this module is imported.
# NOTE(review): the spider imports this before creating its process Pool, so
# workers may end up sharing one connection - confirm this is safe.
producer = Producer('chip1stop_new_goods_store')
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment