The other day I found an information-security resource site, HeiBai ("black and white"), with a lot of material on information security. Someone like me, who wants to hoard every resource he sees, couldn't resist — but there is a lot of data there, so I wrote a Python 3 script to download it.
The main reason is that the official site announced that all its services will be shut down this year — how could I not download everything now?
import requests
import re
import time
from pathlib import Path

url = 'https://edu.heibai.org/?c=cache'   # index page that lists every file
url2 = 'https://edu.heibai.org/'          # base URL the files are served from
path = 'E:\\ Information security \\\\test\\'  # local download root

# Scrape the index page and persist the relative file paths to heibai.txt
# so the download loop below (and later re-runs) can work from the list.
req = requests.get(url)
a = re.findall(r"<a href = '\./\./(.*)' target = ", req.text)
with open('heibai.txt', 'w+', encoding='utf8') as f:
    for i in a:
        f.write(i + '\n')
    # NOTE: the original called f.close() here; the `with` block already
    # closes the file, so the explicit close was redundant and removed.


def mkdir(path):
    """Create directory *path* (with parents) if missing.

    Returns True when the directory was created, False when it already existed.
    """
    import os
    path = path.strip()
    if not os.path.exists(path):
        os.makedirs(path)
        return True
    return False


with open('heibai.txt', 'r', encoding='utf8') as f:
    for i in f:
        # Strip only the trailing newline. The original used i[0:-1], which
        # silently chops the last character of a final line that has no '\n'.
        i = i.rstrip('\n')
        my_file = Path(path + i)
        if not my_file.exists():
            # Ensure the destination sub-directory exists before writing.
            if '/' in i:
                new = re.findall(r"(.*)/", i)
                mkdir(path + new[0])
            print('Downloading:' + i + ' ' + time.asctime(time.localtime(time.time())))
            r = requests.get(url2 + i)
            with open(path + i, "wb") as code:
                code.write(r.content)
            print("Finnish!")
I left it running to download everything locally, but later found it was really slow, so I considered multi-threaded downloading.
I hadn't learned multithreading yet, so I picked up the basic principle on the fly, then copied a script from the internet and adapted it.
# -*- coding: utf-8 -*-
import threading
import sys
import requests
import time
import os
import re
from pathlib import Path

url = 'https://edu.heibai.org/?c=cache'   # index page that lists every file
url2 = 'https://edu.heibai.org/'          # base URL the files are served from
path = 'E:\\ Information security \\ An Quanshu and notes \\heibai\\'  # download root


def txt():
    """Scrape the index page and write the relative file paths to heibai.txt."""
    req = requests.get(url)
    a = re.findall(r"<a href = '\./\./(.*)' target = ", req.text)
    with open('heibai.txt', 'w+', encoding='utf8') as f:
        for i in a:
            f.write(i + '\n')


def mkdir(path):
    """Create directory *path* (with parents) if missing; True when created."""
    import os
    path = path.strip()
    if not os.path.exists(path):
        os.makedirs(path)
        return True
    return False


class MulThreadDownload(threading.Thread):
    """Worker thread: fetch bytes [startpos, endpos] of *url* with an HTTP
    Range request and write them at the right offset of the shared file."""

    def __init__(self, url, startpos, endpos, f):
        super(MulThreadDownload, self).__init__()
        self.url = url
        self.startpos = startpos
        self.endpos = endpos
        self.fd = f  # duplicated file handle, owned by this thread

    def download(self):
        # Range end is inclusive per RFC 7233.
        headers = {"Range": "bytes=%s-%s" % (self.startpos, self.endpos)}
        res = requests.get(self.url, headers=headers)
        self.fd.seek(self.startpos)
        self.fd.write(res.content)

    def run(self):
        self.download()


# Files the website refuses to serve; skip them instead of erroring out.
Blacklist = [' Mind mapping / Mobile Security /.DS_Store', ]

txt()
with open('heibai.txt', 'r', encoding='utf8') as f:
    for filename in f:
        filename = filename[0:-1]
        if filename in Blacklist:
            continue
        my_file = Path(path + filename)
        if not my_file.exists():
            if '/' in filename:
                new = re.findall(r"(.*)/", filename)
                mkdir(path + new[0])
            # Retry the HEAD request until the server answers (IP bans
            # usually lift after a while).
            while 1:
                try:
                    filesize = int(requests.head(url2 + filename).headers['Content-Length'])
                except Exception:
                    print('10 Try connecting to the server again in minutes !')
                    time.sleep(60 * 10)
                    continue
                break
            print('Downloading:' + filename + ' ' + time.asctime(time.localtime(time.time())))
            # Number of download threads per file. (The original also created
            # a threading.BoundedSemaphore here but never acquired it, so it
            # had no effect and was removed.)
            threadnum = 2
            step = filesize // threadnum
            mtd_list = []
            start = 0
            end = -1
            # Create/empty the target file so it can be opened 'rb+'.
            tempf = open(path + filename, 'w')
            tempf.close()
            # 'rb+': binary, readable and writable at any offset.
            # Renamed the handle to fh — the original reused `f`, shadowing
            # the heibai.txt handle being iterated by the outer loop.
            with open(path + filename, 'rb+') as fh:
                fileno = fh.fileno()
                # For an 11-byte file the valid offsets are 0-10; loop until
                # the last assigned chunk reaches filesize - 1.
                while end < filesize - 1:
                    start = end + 1
                    # Clamp so the inclusive Range end never exceeds the
                    # last byte (the original allowed end == filesize).
                    end = min(start + step - 1, filesize - 1)
                    # Each thread gets its own duplicated handle so seeks
                    # don't interfere.
                    dup = os.dup(fileno)
                    fd = os.fdopen(dup, 'rb+', -1)
                    # BUG FIX: the original passed `url` (the index page),
                    # so every chunk contained index-page HTML instead of
                    # the file's bytes. Download the actual file URL.
                    t = MulThreadDownload(url2 + filename, start, end, fd)
                    t.start()
                    mtd_list.append(t)
                # Wait for all chunks before moving to the next file.
                for t in mtd_list:
                    t.join()
I wrote this out of sheer boredom — the hard disk was sitting idle anyway.
The code is admittedly somewhat messy.
One file needs special handling in the script: ` Mind mapping / Mobile Security /.DS_Store`.
That file is blocked by the website and cannot be downloaded. Also, multithreading seems to be too fast: the server banned my IP for a while, so whether to use the multithreaded script depends on your network speed. I later added a delayed-retry connection to the multithreaded version, so it should be able to finish downloading overnight.
I only added reconnection to the multi-threaded download part of the script, so if the IP gets banned anywhere else, the script raises an error.
After the original script had been downloading for a while, the server would time out and the script would get stuck, so I simply sleep 5 seconds between file downloads — that seems to work well enough.
# Fragment of the download loop: throttle between files, then keep retrying
# the HEAD request until the server responds with the file size.
if not my_file.exists():
    # Create the destination sub-directory first if the entry has one.
    if '/' in filename:
        parent_dirs = re.findall(r"(.*)/", filename)
        mkdir(path + parent_dirs[0])
    print(' To prevent from being ban, Pause 5s in ...')
    time.sleep(5)
    while True:
        try:
            head_resp = requests.head(url2 + filename)
            filesize = int(head_resp.headers['Content-Length'])
        except Exception:
            # Server unreachable or banned us — back off for 10 minutes.
            print('10 Try connecting to the server again in minutes !')
            time.sleep(60 * 10)
        else:
            break
    print('Downloading:' + filename + ' ' + time.asctime(time.localtime(time.time())))