Source code: https://gitee.com/shentuzhigang/mini-project/tree/master/brand-crawler
import json
import openpyxl
import requests

# Crawl the Vipshop WeChat mini-program category API: list every category
# tab, pull each tab's "Select brands" section, flatten the brand records,
# and save them all to vip.xlsx.
# NOTE(review): the original indentation was lost in transit; the structure
# below was reconstructed from the control-flow statements.
allBrandList = []

# Request 1: the list of category tabs.
r = requests.get(
    'https://wxapi.appvipshop.com/vips-mobile/rest/shopping/category/index/get_tab/v1?api_key=ce29a51aa5c94a318755b2529dcb8e0b&_xcxid=1648525360552&app_name=shop_weixin_mina&client=wechat_mini_program&source_app=shop_weixin_mina&app_version=4.0&client_type=wap&format=json&mobile_platform=2&ver=2.0&standby_id=native&mobile_channel=nature&mars_cid=1648202533707_3e155b4df020055b73d22e80fb75afa6&warehouse=VIP_SH&fdc_area_id=103103101&province_id=103103&wap_consumer=B&t=1648525360&net=wifi&width=414&height=622&hierarchy_id=107&category_id=&category_filter=&sale_for=&client_from=wxsmall')
json1 = r.json()
data1 = json1['data']['data']['tabs']
for tab in data1:
    print(tab['categoryId'])
    # Request 2: the section data for this tab.
    # NOTE(review): assumes 'categoryId' is a str (it is concatenated into
    # the URL) — confirm against the live API.
    r2 = requests.get(
        'https://wxapi.appvipshop.com/vips-mobile/rest/shopping/category/index/get_tab_data/v1?api_key=ce29a51aa5c94a318755b2529dcb8e0b&_xcxid=1648525360675&app_name=shop_weixin_mina&client=wechat_mini_program&source_app=shop_weixin_mina&app_version=4.0&client_type=wap&format=json&mobile_platform=2&ver=2.0&standby_id=native&mobile_channel=nature&mars_cid=1648202533707_3e155b4df020055b73d22e80fb75afa6&warehouse=VIP_SH&fdc_area_id=103103101&province_id=103103&wap_consumer=B&t=1648525360&net=WIFI&width=750&height=500&pcmpWidth=510&hierarchy_id=107&category_id=' +
        tab['categoryId'] + '&sale_for=')
    json2 = r2.json()
    data2 = json2['data']['data']
    sectionList = data2['sectionList']
    for section in sectionList:
        if section['sectionType'] == 'category' and section['category']['name'] == ' Select brands ':
            for brand in section['category']['children']:
                # Flatten one nesting level: merge every dict-valued field
                # into a copy of the brand record.
                B = dict(brand)
                for b in brand:
                    if isinstance(brand[b], dict):
                        B = dict(B, **brand[b])
                print(B)
                allBrandList.append(B)

# Write to xlsx: row 1 holds the union of all keys (first-seen order);
# each brand occupies one row below it.
f = openpyxl.Workbook()
sheet1 = f.create_sheet('vip')
keys = dict()  # key -> 1-based column index
i = 1
for jkey in range(len(allBrandList)):
    for key, value in allBrandList[jkey].items():
        if key in keys:
            continue
        sheet1.cell(row=1, column=i).value = key
        keys[key] = i
        i += 1
for jkey in range(len(allBrandList)):
    jk = jkey + 2  # data rows start under the header row
    # (an unused per-row counter from the original was dropped)
    for key, value in allBrandList[jkey].items():
        sheet1.cell(row=jk, column=keys[key]).value = str(value)
f.save('vip.xlsx')
import json
import openpyxl

# Load the brand list previously dumped from the Dewu API (dewu.json),
# flatten each record, and write the result to dewu.xlsx.
load_dict = ''
with open("dewu.json", 'r') as load_f:
    load_dict = json.load(load_f)
series = load_dict['data']['list']

allBrandList = []
for l in series:
    # Each list entry nests its payload one level down; merge all nested
    # dicts into a single flat record.
    # NOTE(review): assumes every top-level value of the entry is a dict —
    # confirm against the dump's actual schema.
    dc = dict()
    for d in l:
        dc = dict(dc, **l[d])
    print(dc)
    allBrandList.append(dc)

# Assign a column to every distinct key, in first-seen order.
keys = dict()
i = 1
for jkey in range(len(allBrandList)):
    for key, value in allBrandList[jkey].items():
        if key in keys:
            continue
        keys[key] = i
        i += 1

# NOTE(review): no header row is written; data starts at row 1.
f = openpyxl.Workbook()
sheet1 = f.create_sheet('dewu')
for jkey in range(len(allBrandList)):
    jk = jkey + 1
    for key, value in allBrandList[jkey].items():
        # The original guarded on a counter that was incremented before the
        # "== 0" test, so the "write the key instead" branch was dead code;
        # the value is always written.
        sheet1.cell(row=jk, column=keys[key]).value = str(value)
f.save('dewu.xlsx')
Platform restriction: only the first 10,000 records can be accessed.
Python 2 version:
# coding=utf-8
import json
import urllib2
f = open('data.json','w')
listAll = []
for i in range(1,100):
url = "https://api-service.chanmama.com/v2/home/brand/search?page="+ str(i) +"&category=&keyword=&fans_age=&fans_gender=-1&fans_province=&sort=day_volume&order_by=desc&size=100&has_aweme_sale=0&has_live_sale=0&interaction_inc_range=&amount_range="
print url
request = urllib2.Request(url)
# Imitate Firefox
request.add_header("cookie", "***")
request.add_header("user-agent", "Mozilla/5.0")
response = urllib2.urlopen(request)
code = response.getcode()
content = response.read()
s = json.loads(content)
data = s['data']
list = data['list']
listAll.extend(list)
f.write(json.dumps(listAll))
Python 3 version:
# coding=utf-8
import json
import requests

# Python 3 version of the chanmama brand crawler (requests instead of
# urllib2). The original opened data.json in 'w' mode — truncating the
# Python 2 version's output — while the only write was commented out; that
# pointless, leaking handle is removed (unused status-code read dropped too).
listAll = []
for i in range(1, 100):
    url = "https://api-service.chanmama.com/v2/home/brand/search?page=" + str(
        i) + "&category=&keyword=&fans_age=&fans_gender=-1&fans_province=&sort=day_volume&order_by=desc&size=100&has_aweme_sale=0&has_live_sale=0&interaction_inc_range=&amount_range="
    print(url)
    response = requests.get(url, headers={
        "cookie": "***",
        # Imitate Firefox
        "user-agent": "Mozilla/5.0"
    })
    content = response.json()
    data = content['data']
    brand_page = data['list']  # renamed: 'list' shadowed the builtin
    listAll.extend(brand_page)
# The dump itself was left disabled in the original:
# with open('data.json', 'w') as f:
#     f.write(json.dumps(listAll))
Version 1:
import requests
import json

# Crawl the aikucun brand-subscription API: fetch all activity tags, then
# query the brand list for each (tag, status) pair and dump the raw brand
# records to aikucun.json.
cookie = '***'
headers = {
    "cookie": cookie,
    # Imitate Firefox
    "user-agent": "Mozilla/5.0"
}
response = requests.get(
    'https://h5-shop.aikucun.com/api/commodity/subscribe/v2/queryActivityTagV2/WWDLOwYqYqr?shopBizCode=WWDLOwYqYqr',
    headers=headers)
tags = response.json()
tagNos = []
allBrandList = []
for tag in tags['data']:
    tagNos.append(tag['activityTagNo'])
    for status in range(1, 3):  # status values 1 and 2
        print('tag:' + tag['activityTagNo'] + ',status:' + str(status))
        res = requests.get(
            'https://h5-shop.aikucun.com/api/commodity/subscribe/v2/h5/querybrandList?shopBizCode=WWDLOwYqYqr',
            params={
                'tagNo': tag['activityTagNo'],
                'status': status
            },
            headers=headers)
        json1 = res.json()
        # Some (tag, status) pairs return no 'data' key; skip those quietly.
        if 'data' in json1:
            data = json1['data']
            brandLists = data['brandList']
            for brandList in brandLists:
                blist = brandList['brandList']
                for b in blist:
                    allBrandList.append(b)
                    print(dict(b, **b['brandExtend']))

# json.dumps already returns str, so the original's .encode().decode()
# round trip was a no-op; also close the file deterministically.
with open('aikucun.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(allBrandList))
Version 2: save the results to xlsx and repair the garbled (mojibake) brand names.
import re
import requests
import openpyxl

# Version 2 of the aikucun crawler: same crawl as version 1, but repairs
# the garbled brand-name prefix and saves everything to aikucun.xlsx.
cookie = ''
headers = {
    "cookie": cookie,
    # Imitate Firefox
    "user-agent": "Mozilla/5.0"
}
response = requests.get(
    'https://h5-shop.aikucun.com/api/commodity/subscribe/v2/queryActivityTagV2/WWDLOwYqYqr?shopBizCode=WWDLOwYqYqr',
    headers=headers)
tags = response.json()
tagNos = []
allBrandList = []
for tag in tags['data']:
    tagNos.append(tag['activityTagNo'])
    for status in range(1, 3):  # status values 1 and 2
        print('tag:' + tag['activityTagNo'] + ',status:' + str(status))
        res = requests.get(
            'https://h5-shop.aikucun.com/api/commodity/subscribe/v2/h5/querybrandList?shopBizCode=WWDLOwYqYqr',
            params={
                'tagNo': tag['activityTagNo'],
                'status': status
            },
            headers=headers)
        json1 = res.json()
        if 'data' in json1:
            data = json1['data']
            brandLists = data['brandList']
            for brandList in brandLists:
                blist = brandList['brandList']
                for b in blist:
                    print(dict(b, **b['brandExtend']))
                    if 'pcodelen' in b and b['pcodelen'] != '':
                        # 'pcodelen' carries the true brand-name prefix as a
                        # run of 4-hex-digit Unicode code points. Decode it
                        # with chr()/int() instead of building a u"\uXXXX"
                        # literal and eval()-ing it as the original did —
                        # same result, no eval on API-supplied data.
                        str1 = ''.join(
                            chr(int(cp, 16))
                            for cp in re.findall(r'.{4}', str(b['pcodelen'])))
                        print(str1)
                        # Splice the decoded prefix over the garbled start
                        # of 'pinpaiming'.
                        b['pinpaiming0'] = str1 + str(b['pinpaiming'])[len(str1):]
                        print(b['pinpaiming0'])
                    allBrandList.append(b)
                    print(sorted(dict(b, **b['brandExtend']).items(), key=lambda d: d[0]))
# f = open('aikucun.json', 'w', encoding='utf-8')
# f.write(json.dumps(allBrandList))

# Assign a column to every distinct key, in first-seen order.
keys = dict()
i = 1
for jkey in range(len(allBrandList)):
    for key, value in allBrandList[jkey].items():
        if key in keys:
            continue
        keys[key] = i
        i += 1

# NOTE(review): no header row is written; data starts at row 1.
f = openpyxl.Workbook()
sheet1 = f.create_sheet('aikucun')
for jkey in range(len(allBrandList)):
    jk = jkey + 1
    for key, value in allBrandList[jkey].items():
        # The original's "if cT == 0" branch was unreachable (the counter
        # was incremented before the test), so the value is always written.
        sheet1.cell(row=jk, column=keys[key]).value = str(value)
f.save('aikucun.xlsx')
import requests
import json

# Probe the webuy shop-category brand API across a range of category ids
# (bounds found empirically) for both exhibition-park types, collecting
# every returned brand entry into webuy.json.
headers = {
    # Imitate Firefox
    "user-agent": "Mozilla/5.0"
}
allBrandList = []
for i in range(-300, 600):
    for ty in [1, 5]:
        response = requests.post('https://www.webuy.ai/sesame/hyk/shopCategory/brand/detail',
                                 headers=headers,
                                 json={
                                     "exhibitionParkType": ty,
                                     "categoryId": i,
                                     "shopId": 3572,
                                     "pageSize": 1000,
                                     "pageNo": 1,
                                     "isPageQuery": False
                                 })
        # Parse the body once (the original called response.json() twice).
        json1 = response.json()
        print(json1)
        entry = json1['entry']
        for b in entry:
            print(b)
            allBrandList.append(b)

# json.dumps already returns str — the original's .encode().decode() round
# trip was a no-op; also close the file deterministically.
with open('webuy.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(allBrandList))
import requests
from bs4 import BeautifulSoup
import openpyxl
from openpyxl.drawing.image import Image
from PIL import Image as PILImage
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor
import threading
import time

# Shared workbook for the chinasspp crawl. Row 1 carries the fixed header
# columns; detail-page fields are later matched against these titles by name.
f = openpyxl.Workbook()
sheet1 = f.create_sheet('chinasspp')
headers = [' The brand name ', ' Industry categories ', ' Corporate name ', ' contact number ', ' Company fax ', ' Official website ', ' Contact address ', ' Online Service ']
for col, title in enumerate(headers, start=1):
    sheet1.cell(row=1, column=col).value = title
count = 1  # last used row; row 1 is the header
def parseDetail(no, link):
    """Fetch one brand detail page and write its fields into sheet row *no*.

    Runs on a worker thread. Each "key: value" list item on the page is
    matched against the shared ``headers`` titles to pick a column; phone
    and fax numbers are rendered as images on the site, so those are
    downloaded and embedded instead.

    NOTE(review): openpyxl worksheets are not documented as thread-safe;
    concurrent cell writes from pool workers may race — confirm.
    """
    response = requests.get(link)
    response.encoding = "gbk"  # the site serves GBK-encoded pages
    soup = BeautifulSoup(response.text, 'lxml')
    print('no' + str(no))
    for item in soup.select_one("#brand_info_ctl00_blink").select('li'):
        # Split on the FIRST colon only, so values that themselves contain
        # ':' (e.g. "http://..." website URLs) are not truncated; partition
        # also tolerates items with no colon instead of raising IndexError.
        key, _, value = item.text.partition(':')
        # (the original's .encode().decode() round trip on value was a no-op)
        sheet1.cell(row=no, column=headers.index(key) + 1).value = value
        if key in (' contact number ', ' Company fax '):
            # Number is an image: download it and anchor it at the same
            # cell position (column letter + row number).
            img_url = 'http://www.chinasspp.com' + item.select_one('img').attrs.get('src')
            img = PILImage.open(BytesIO(requests.get(img_url).content))
            sheet1.add_image(Image(img), chr(ord("A") + headers.index(key)) + str(no))
# Crawl all 515 listing pages of the chinasspp women's-wear brand index and
# farm each brand's detail page out to the thread pool.
with ThreadPoolExecutor(max_workers=16) as pool:
    for i in range(1, 516):
        print('Page ' + str(i))
        response = requests.get("http://www.chinasspp.com/brand/%E5%A5%B3%E8%A3%85%E5%93%81%E7%89%8C/" + str(i) + "/")
        soup = BeautifulSoup(response.text, 'lxml')
        # (the original called soup.select(".brand") once extra and
        # discarded the result; the unused future handle is dropped too)
        for brand in soup.select(".brand"):
            link = brand.select_one('.logo').attrs.get('href')
            count += 1
            pool.submit(parseDetail, count, link)
    # Leaving the with-block already waits for all workers, so the
    # original's explicit pool.shutdown(wait=True) was redundant.
f.save('chinasspp.xlsx')