The other day I found an information-security resource site, HeiBai ("black and white"), with a lot of material on information security. Someone like me, who wants to hoard every resource he sees, couldn't resist — but there is a lot of data there, so I wrote a Python 3 script to download it.
The main reason is that the official site announced that all its services will be shut down this year — how could I not download everything now?
import requests
import re
import time
from pathlib import Path

url = 'https://edu.heibai.org/?c=cache'   # index page that lists every file
url2 = 'https://edu.heibai.org/'          # base URL the files are served from
path = 'E:\\ Information security \\\\test\\'  # local download root

# Scrape the index page and persist the relative file paths to heibai.txt
# so the download loop below (and later re-runs) can work from the list.
req = requests.get(url)
a = re.findall(r"<a href = '\./\./(.*)' target = ", req.text)
with open('heibai.txt', 'w+', encoding='utf8') as f:
    for i in a:
        f.write(i + '\n')
    # NOTE: the original called f.close() here; the `with` block already
    # closes the file, so the explicit close was redundant and removed.


def mkdir(path):
    """Create directory *path* (with parents) if missing.

    Returns True when the directory was created, False when it already existed.
    """
    import os
    path = path.strip()
    if not os.path.exists(path):
        os.makedirs(path)
        return True
    return False


with open('heibai.txt', 'r', encoding='utf8') as f:
    for i in f:
        # Strip only the trailing newline. The original used i[0:-1], which
        # silently chops the last character of a final line that has no '\n'.
        i = i.rstrip('\n')
        my_file = Path(path + i)
        if not my_file.exists():
            # Ensure the destination sub-directory exists before writing.
            if '/' in i:
                new = re.findall(r"(.*)/", i)
                mkdir(path + new[0])
            print('Downloading:' + i + ' ' + time.asctime(time.localtime(time.time())))
            r = requests.get(url2 + i)
            with open(path + i, "wb") as code:
                code.write(r.content)
            print("Finnish!")
I left it running to download everything locally, but later found it was really slow, so I considered multi-threaded downloading.
I hadn't learned multithreading yet, so I picked up the basic principle on the fly, then copied a script from the internet and adapted it.
# -*- coding: utf-8 -*-
import threading
import sys
import requests
import time
import os
import re
from pathlib import Path

url = 'https://edu.heibai.org/?c=cache'   # index page that lists every file
url2 = 'https://edu.heibai.org/'          # base URL the files are served from
path = 'E:\\ Information security \\ An Quanshu and notes \\heibai\\'  # download root


def txt():
    """Scrape the index page and write the relative file paths to heibai.txt."""
    req = requests.get(url)
    a = re.findall(r"<a href = '\./\./(.*)' target = ", req.text)
    with open('heibai.txt', 'w+', encoding='utf8') as f:
        for i in a:
            f.write(i + '\n')


def mkdir(path):
    """Create directory *path* (with parents) if missing; True when created."""
    import os
    path = path.strip()
    if not os.path.exists(path):
        os.makedirs(path)
        return True
    return False


class MulThreadDownload(threading.Thread):
    """Worker thread: fetch bytes [startpos, endpos] of *url* with an HTTP
    Range request and write them at the right offset of the shared file."""

    def __init__(self, url, startpos, endpos, f):
        super(MulThreadDownload, self).__init__()
        self.url = url
        self.startpos = startpos
        self.endpos = endpos
        self.fd = f  # duplicated file handle, owned by this thread

    def download(self):
        # Range end is inclusive per RFC 7233.
        headers = {"Range": "bytes=%s-%s" % (self.startpos, self.endpos)}
        res = requests.get(self.url, headers=headers)
        self.fd.seek(self.startpos)
        self.fd.write(res.content)

    def run(self):
        self.download()


# Files the website refuses to serve; skip them instead of erroring out.
Blacklist = [' Mind mapping / Mobile Security /.DS_Store', ]

txt()
with open('heibai.txt', 'r', encoding='utf8') as f:
    for filename in f:
        filename = filename[0:-1]
        if filename in Blacklist:
            continue
        my_file = Path(path + filename)
        if not my_file.exists():
            if '/' in filename:
                new = re.findall(r"(.*)/", filename)
                mkdir(path + new[0])
            # Retry the HEAD request until the server answers (IP bans
            # usually lift after a while).
            while 1:
                try:
                    filesize = int(requests.head(url2 + filename).headers['Content-Length'])
                except Exception:
                    print('10 Try connecting to the server again in minutes !')
                    time.sleep(60 * 10)
                    continue
                break
            print('Downloading:' + filename + ' ' + time.asctime(time.localtime(time.time())))
            # Number of download threads per file. (The original also created
            # a threading.BoundedSemaphore here but never acquired it, so it
            # had no effect and was removed.)
            threadnum = 2
            step = filesize // threadnum
            mtd_list = []
            start = 0
            end = -1
            # Create/empty the target file so it can be opened 'rb+'.
            tempf = open(path + filename, 'w')
            tempf.close()
            # 'rb+': binary, readable and writable at any offset.
            # Renamed the handle to fh — the original reused `f`, shadowing
            # the heibai.txt handle being iterated by the outer loop.
            with open(path + filename, 'rb+') as fh:
                fileno = fh.fileno()
                # For an 11-byte file the valid offsets are 0-10; loop until
                # the last assigned chunk reaches filesize - 1.
                while end < filesize - 1:
                    start = end + 1
                    # Clamp so the inclusive Range end never exceeds the
                    # last byte (the original allowed end == filesize).
                    end = min(start + step - 1, filesize - 1)
                    # Each thread gets its own duplicated handle so seeks
                    # don't interfere.
                    dup = os.dup(fileno)
                    fd = os.fdopen(dup, 'rb+', -1)
                    # BUG FIX: the original passed `url` (the index page),
                    # so every chunk contained index-page HTML instead of
                    # the file's bytes. Download the actual file URL.
                    t = MulThreadDownload(url2 + filename, start, end, fd)
                    t.start()
                    mtd_list.append(t)
                # Wait for all chunks before moving to the next file.
                for t in mtd_list:
                    t.join()
I wrote this out of sheer boredom — the hard disk was sitting idle anyway.
The code is admittedly somewhat messy.
One file needs special handling in the script: ` Mind mapping / Mobile Security /.DS_Store`.
That file is blocked by the website and cannot be downloaded. Also, multithreading seems to be too fast: the server banned my IP for a while, so whether to use the multithreaded script depends on your network speed. I later added a delayed-retry connection to the multithreaded version, so it should be able to finish downloading overnight.
I only added reconnection to the multi-threaded download part of the script, so if the IP gets banned anywhere else, the script raises an error.
After the original script had been downloading for a while, the server would time out and the script would get stuck, so I simply sleep 5 seconds between file downloads — that seems to work well enough.
# Fragment of the download loop: throttle between files, then keep retrying
# the HEAD request until the server responds with the file size.
if not my_file.exists():
    # Create the destination sub-directory first if the entry has one.
    if '/' in filename:
        parent_dirs = re.findall(r"(.*)/", filename)
        mkdir(path + parent_dirs[0])
    print(' To prevent from being ban, Pause 5s in ...')
    time.sleep(5)
    while True:
        try:
            head_resp = requests.head(url2 + filename)
            filesize = int(head_resp.headers['Content-Length'])
        except Exception:
            # Server unreachable or banned us — back off for 10 minutes.
            print('10 Try connecting to the server again in minutes !')
            time.sleep(60 * 10)
        else:
            break
    print('Downloading:' + filename + ' ' + time.asctime(time.localtime(time.time())))