程序師世界是廣大編程愛好者互助、分享、學習的平台,程序師世界有你更精彩!
首頁
編程語言
C語言|JAVA編程
Python編程
網頁編程
ASP編程|PHP編程
JSP編程
數據庫知識
MYSQL數據庫|SqlServer數據庫
Oracle數據庫|DB2數據庫
您现在的位置: 程式師世界 >> 編程語言 >  >> 更多編程語言 >> Python

Python crawler series Mingtong market data crawling

編輯:Python

Python Crawler series of Mingtong market data crawling

We take orders for mini-program crawlers, app crawlers, web crawlers, interface customization, website development and mini-program development > Click here to contact us <

Please scan the QR code below for wechat

The code is for learning and communication only; do not use it for illegal purposes. The encryption algorithm is not provided — it is for reference only.

Go straight to the code

import configparser
import json
import os
import threading
import time
import urllib.parse
from queue import Queue

import pandas as pds
import requests
import xlrd
import xlwt
'''
The code is for learning only , Do not use abnormally
'''
# Request headers mimicking the WeChat in-app browser on iOS, plus the
# form-encoded content type the search API expects.
headers = {
    "Content-Type": "application/x-www-form-urlencoded",
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/7.0.15(0x17000f31) NetType/WIFI Language/zh_CN",
}
# Column titles for the output spreadsheets (leading/trailing spaces kept
# byte-for-byte; they are written verbatim into the header row).
excelTitle = [" date ", " Product quotation ", " Shop "]
# Directory holding both the input keyword workbooks and the output files.
excelPwd = os.getcwd() + "/excels/"
# BUG FIX: the original tested/created the undefined name `gexcelPwd`,
# which raised NameError at import time.
if not os.path.exists(excelPwd):
    os.mkdir(excelPwd)
cf = configparser.ConfigParser()
try:
    # utf-8-sig tolerates a BOM written by Windows editors.
    cf.read(os.getcwd() + "/conf.ini", encoding="utf-8-sig")
except Exception as e:
    print(" Program directory does not exist conf.ini The configuration file ~")
    exit(0)
def getConf(sec, key):
    """Return option `key` from section `sec` of the loaded conf.ini.

    Any lookup failure (missing section/option) prints a message and
    terminates the program.
    """
    try:
        return cf.get(sec, key)
    except Exception as err:
        print(err)
        print(" The following configuration is not available :" + sec + " - " + key)
        exit(0)
# Worker-thread count from conf.ini; any read/parse failure or a
# non-positive value falls back to a single thread.
try:
    threadNums = int(getConf("app-sys", "threadNums"))
except Exception:
    threadNums = 1
if threadNums <= 0:
    threadNums = 1
def postHtml(url, data):
    """POST form `data` to `url` and return the decoded JSON body as a dict.

    Retries up to 3 times on any failure (network error, non-UTF-8 body,
    invalid JSON) and returns None when every attempt fails — callers must
    handle a None result.
    """
    for _attempt in range(3):
        try:
            # timeout added so an unresponsive server cannot hang the
            # worker thread forever (the original request could block
            # indefinitely).
            resp = requests.post(url, data=data, headers=headers, timeout=30)
            return json.loads(resp.content.decode("utf-8"))
        except Exception:
            # best-effort retry; deliberately silent like the rest of the
            # script
            continue
    return None
def getSign(page):
    """Compute the request signature for `page` by shelling out to encrypt.js.

    NOTE(review): the signing algorithm lives in encrypt.js, which is not
    distributed with this script; if node or the script is missing this
    loops forever (original behavior kept).
    """
    while True:
        try:
            pipe = os.popen('node encrypt.js ' + str(page))
            # BUG FIX: the original returned `resp.buffer.read()` — raw
            # bytes including the trailing newline — which later breaks the
            # string concatenation that builds the request URL.  Read the
            # pipe as text and strip the newline instead.
            return pipe.read().strip()
        except Exception:
            pass
def getCurrentTime():
    """Return the current local time as 'YYYY-MM-DD_HH-MM-SS' (filesystem-safe)."""
    # strftime defaults to localtime(), matching the original's explicit call.
    return time.strftime('%Y-%m-%d_%H-%M-%S')
class mtSpider(threading.Thread):
    """Crawler worker thread.

    Pulls keyword tasks from a shared queue, pages through the Mingtong
    search API for each keyword, and appends every goods row to this
    thread's own .xls output file.
    """

    def __init__(self, keywordQueue, index, *args, **kwargs):
        super(mtSpider, self).__init__(*args, **kwargs)
        # Queue of {"keyword": str, "crawlerNum": int} tasks shared by all workers.
        self.keywordQueue = keywordQueue
        # BUG FIX: the module defines `excelPwd`; the original referenced the
        # undefined name `gexcelPwd` here, crashing thread construction.
        self.excelPath = excelPwd + "data_" + getCurrentTime() + "_" + str(index) + ".xls"

    def initExcel(self):
        """Create this thread's output workbook with the title row.

        BUG FIX: run() calls initExcel() but the method was missing
        entirely, so every thread died with AttributeError before crawling
        anything.
        """
        try:
            workbook = xlwt.Workbook(encoding="utf-8")
            sheet = workbook.add_sheet("sheet1")
            for col in range(len(excelTitle)):
                sheet.write(0, col, excelTitle[col])
            workbook.save(self.excelPath)
        except Exception as e:
            print(e)

    def writeExcel(self, data):
        """Append one row (`data`, a list of cell values) to the output file."""
        print("-" * 10)
        print(data)
        print("-" * 10)
        try:
            workbook = xlrd.open_workbook(self.excelPath)
            sheets = workbook.sheet_names()
            worksheet = workbook.sheet_by_name(sheets[0])
            rows_old = worksheet.nrows  # next free row index
            # NOTE(review): `copy` here is xlutils.copy.copy, which this file
            # never imports — without `from xlutils.copy import copy` every
            # write fails and is swallowed by the except below; confirm the
            # dependency is installed and imported.
            new_workbook = copy(workbook)
            new_worksheet = new_workbook.get_sheet(0)
            for j in range(len(data)):
                try:
                    new_worksheet.write(rows_old, j, str(data[j]))
                except Exception:
                    continue
            new_workbook.save(self.excelPath)
        except Exception:
            pass

    def getGoodsList(self, keyword, page):
        """Query one search-result page for `keyword`; return the 'Data' list or None."""
        sign = getSign(page)
        # NOTE(review): `OpenID` is a module global that is never defined in
        # this file — presumably it should be loaded from conf.ini; confirm.
        url = "https://www.mtzh.ltd/api/all/AllProduct/PostSearchNew?sign=" + sign + "&word=" + str(keyword) + "&OpenID=" + OpenID
        data = {
            "sign": sign,
            "word": keyword,
            "OpenID": OpenID,
        }
        res = postHtml(url, data)
        try:
            return res['Data']
        except Exception:
            return None

    def run(self):
        self.initExcel()
        while True:
            if self.keywordQueue.empty():
                break
            task = self.keywordQueue.get()
            crawlerNum = task['crawlerNum']  # max rows to collect for this keyword
            currNums = 0
            page = 1
            stop = False
            while True:
                # BUG FIX: the original passed the undefined name
                # `keywords`, crashing the thread with NameError on the
                # first request.
                goodsList = self.getGoodsList(task['keyword'], page)
                if goodsList and len(goodsList) > 0:
                    for goods in goodsList:
                        try:
                            ModifyDate = goods.get('ModifyDate', "")
                            ShopName = goods.get('ShopName', "")
                            # BUG FIX: ShopNumber was initialised to "" but
                            # never read from the response, so the shop
                            # column always lost it.
                            ShopNumber = goods.get('ShopNumber', "")
                            # NOTE(review): only two of the three title
                            # columns are ever filled — the
                            # " Product quotation " column stays empty;
                            # confirm which response field should feed it.
                            row = [ModifyDate, ShopNumber + "\n" + ShopName]
                            self.writeExcel(row)
                            currNums += 1
                            if currNums >= crawlerNum:
                                stop = True
                                break
                        except Exception:
                            continue
                    if stop:
                        break
                    page += 1
                    time.sleep(5)  # throttle between pages
                else:
                    break
def getKeywordsQueue():
    """Build the work queue from every spreadsheet in the excels/ directory.

    Each row must provide a ' key word ' cell; an optional
    ' Collection quantity ' cell caps how many goods to collect for that
    keyword (invalid or non-positive values default to 1000).  Unreadable
    files and malformed rows are skipped silently, matching the script's
    best-effort style.
    """
    keywordQueue = Queue(0)
    try:
        fs = os.listdir(excelPwd)
    except Exception:
        return keywordQueue
    for f in fs:
        try:
            # BUG FIX: pandas >= 1.2 removed read_excel's `encoding`
            # parameter; passing it raised TypeError and silently skipped
            # every input file.
            df = pds.read_excel(excelPwd + f)
        except Exception:
            continue
        for _, rowData in df.iterrows():
            try:
                keywords = rowData[' key word ']
                try:
                    crawlerNum = int(rowData[' Collection quantity '])
                    if crawlerNum <= 0:
                        crawlerNum = 1000
                except Exception:
                    crawlerNum = 1000
                if not pds.isnull(keywords):
                    keywordQueue.put({"keyword": keywords, "crawlerNum": crawlerNum})
            except Exception:
                continue
    return keywordQueue
def main():
    """Build the keyword queue from the input spreadsheets and launch the workers."""
    global threadNums
    # BUG FIX: the original read `keywordQueue.qsize()` without ever
    # creating `keywordQueue`, crashing immediately with NameError.
    keywordQueue = getKeywordsQueue()
    keywordLen = keywordQueue.qsize()
    if keywordLen > 0:
        for i in range(threadNums):
            m = mtSpider(keywordQueue, i)
            m.start()
    else:
        print(" No... Was read excel key word , Please check excel Standard or not !")
# Script entry point: read the keyword spreadsheets and start the crawler threads.
if __name__ == '__main__':
    main()

  1. 上一篇文章:
  2. 下一篇文章:
Copyright © 程式師世界 All Rights Reserved