Hi everyone! This is Little Panda ~
Today we are going to scrape a well-known video danmaku (bullet-comment) website!
It has the latest anime in good time, an active ACG atmosphere, and original uploaders (UPs).
You can find plenty of fun there.
Software used: ffmpeg < its folder needs to be added to the environment variables >, which merges the downloaded video and audio into one file.
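A tiny check (my own addition, not part of the original script) that ffmpeg can actually be found: if it prints None, the environment variable is not set up yet.
import shutil
print(shutil.which('ffmpeg'))  # prints the path to ffmpeg if it is on PATH, otherwise None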
We want the data to come from the page's source code:
I. Open the target video URL in a browser
II. Right-click the page and choose View Page Source; a new window will pop up
III. Press Ctrl + F to open the search box and search for playinfo; the relevant video data can be found there
IV. Press Ctrl + F again and search for the video title; the related data can be found there as well
(A short code sketch of steps III and IV follows this list.)
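As promised, a small sketch of steps III and IV in code form (my own addition, just to show the idea): fetch the page source with requests and confirm that the playinfo block and the title field really are in it. The URL is left incomplete on purpose, as explained further down; fill in the site and a real BV number yourself.
import requests

play_url = 'https://www..com/video/xxxx'  # placeholder video page, fill it in yourself
html = requests.get(play_url, headers={'user-agent': 'Mozilla/5.0'}).text
print('window.__playinfo__' in html)  # True means step III will find the video data
print('"title":"' in html)  # True means step IV will find the title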
If any of the modules below are not built in, remember to install them first ~ (the install command comes right after the import list)
import requests  # data request module < the tool that sends requests >
import re  # regular expressions
import json  # serialization and deserialization
import pprint  # formatted (pretty-print) output module
import subprocess  # run external commands (used here for ffmpeg)
import os  # file and path handling
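Of the modules above, only requests is not part of the standard library. If it is missing, install it first from a terminal (this assumes pip is available):
pip install requests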
Because of the review mechanism, I removed part of the website address; you can add it back yourselves, it's easy.
There are also a couple of words that I replaced with Pinyin; you can change them back to the real text ~
If you're feeling a bit lazy or can't work out what to change, you can also message me and I'll send it to you ~
def get_response(html_url, data=None):
    """
    Send-request function.
    def is the keyword for defining a custom function; get_response is the custom function name.
    :param html_url: formal parameter < has no meaning on its own >, the URL the request is sent to
    :param data: optional query parameters
    :return: the response object

    We impersonate a browser when sending the request: headers are the request
    headers used for the disguise, and user-agent is the browser's basic identity.
    <Response [200]> -- in Python, <> marks an object, here the response object;
    200 is the status code meaning the request succeeded. Think of it as a phone call:
    200 the call goes through, 404 wrong URL (the number you dialed does not exist),
    503 server trouble (the number is out of the service area), 403 no access (the line is busy).
    """
    headers = {
        'referer': 'https://www..com/',  # anti-hotlink header: tells the server which page this request jumped from
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'
    }
    response = requests.get(url=html_url, params=data, headers=headers)
    return response
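A small usage sketch of the function above (my own addition): the printed status code is the "phone call" result described in the docstring. The URL is left incomplete here as well.
response = get_response('https://www..com/')  # fill in the site yourself
print(response)  # <Response [200]> means the request went through
print(response.status_code)  # 200 / 403 / 404 / 503, as explained above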
def get_video_info(play_url):
    """
    Get video information.
    :param play_url: the video's play/detail page URL
    :return: video information [title, audio_url, video_url]

    "Look up at the stars and keep your feet on the ground." If your regular
    expressions are still shaky, practise them: with the right syntax you can
    parse out whatever you want from string data.
    re.findall() uses findall inside the re module to find all the data we want:
    search inside response.text, and the (.*?) in '"title":"(.*?)","pubdate"' is
    the data we are after. The title is saved and used later as the file name.
    The result is a list of strings: len() counts the elements, [0] takes the
    value at that index (list indexes start from 0), and type() is a built-in
    function for checking the data type.
    """
    # Defining a function is not enough -- it still has to be called
    response = get_response(html_url=play_url)  # call the request function defined earlier; functions can be reused
    # print(response.text)  # response.text is the text of the response object < the page source code >, string data
    title = re.findall('"title":"(.*?)","pubdate"', response.text)[0].replace(' ', '')  # video title
    title = re.sub(r'[\\/:*?"<>|]', '', title)  # strip characters that are not allowed in file names
    html_data = re.findall('<script>window.__playinfo__=(.*?)</script>', response.text)[0]  # video information
    # print(title)
    # print(html_data)
    # print(type(html_data))
    # To make extraction easier, turn the html_data string into json dictionary data
    json_data = json.loads(html_data)
    # print(json_data)
    # print(type(json_data))  # prints everything on one line
    # Strings can use single or double quotes; here the outside is single quotes and the inside double quotes
    # pprint.pprint(json_data)  # pretty-printed, expanded view
    # Taking values from a dictionary: key-value pairs -- use the content left of the colon < key >
    # to extract the content right of the colon < value >
    audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
    video_url = json_data['data']['dash']['video'][0]['baseUrl']
    print(audio_url)
    print(video_url)
    video_info = [title, audio_url, video_url]
    return video_info
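If the json.loads step above feels abstract, here is a tiny standalone illustration (made-up data, not taken from the site): a JSON string becomes a dict, and chaining keys and indexes pulls out the value to the right of the colon.
demo = '{"data": {"dash": {"audio": [{"baseUrl": "https://example.com/a.m4s"}]}}}'
demo_dict = json.loads(demo)
print(type(demo_dict))  # <class 'dict'>
print(demo_dict['data']['dash']['audio'][0]['baseUrl'])  # https://example.com/a.m4s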
def save(title, audio_url, video_url):
    """
    Save-data function.
    :param title: video title
    :param audio_url: url of the audio stream
    :param video_url: url of the video picture stream
    :return:
    """
    audio_content = get_response(html_url=audio_url).content
    video_content = get_response(html_url=video_url).content
    os.makedirs('video', exist_ok=True)  # make sure the local 'video' output folder exists before writing into it
    with open('video\\' + title + '.mp3', mode='wb') as f:
        f.write(audio_content)
    with open('video\\' + title + '.mp4', mode='wb') as f:
        f.write(video_content)
    # merge the separate audio and video files with ffmpeg
    cmd = f"ffmpeg -i video\\{title}.mp4 -i video\\{title}.mp3 -c:v copy -c:a aac -strict experimental video\\{title}output.mp4"
    subprocess.run(cmd, shell=True)
    os.remove(f'video\\{title}.mp4')  # remove the temporary picture-only file
    os.remove(f'video\\{title}.mp3')  # remove the temporary audio-only file
    print('')
    print(title, ' video download complete')
def get_search(page, word):
    """
    Collect one page of search results.
    :param page: which page of results to collect
    :param word: the search keyword
    :return: a list of BV ids
    """
    search_url = 'https://api..com/x/web-interface/search/type'
    data = {
        '__refresh__': 'true',
        '_extra': '',
        'context': '',
        'page': page,
        'page_size': '42',
        'from_source': '',
        'from_spmid': '333.337',
        'platform': 'pc',
        'highlight': '1',
        'single_column': '0',
        'keyword': word,
        'category_id': '',
        'search_type': 'video',
        'dynamic_offset': '84',
        'preload': 'true',
        'com2co': 'true',
    }
    json_data = get_response(html_url=search_url, data=data).json()
    bv_list = [i['bvid'] for i in json_data['data']['result']]
    print(bv_list)
    return bv_list
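A hedged usage sketch of get_search (it mirrors what the menu loop at the bottom does, so it is only an illustration): each BV id it returns can be passed to main(), which is defined further down.
for bv_id in get_search(page=1, word='panda'):  # 'panda' is just an example keyword
    main(bv_id)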
# 6. Collect multiple videos from one uploader
def get_up_video(page, up_id):
    """
    Collect multiple videos from one UP (uploader).
    :param page: which page of the uploader's video list to collect
    :param up_id: the uploader's (video blogger's) id
    :return: a list of BV ids
    """
    up_link = 'https://api..com/x/space/arc/search'
    data = {
        'mid': up_id,
        'ps': '30',
        'tid': '0',
        'pn': page,
        'keyword': '',
        'order': 'pubdate',
        'jsonp': 'jsonp',
    }
    json_data = get_response(html_url=up_link, data=data).json()
    bv_list = [i['bvid'] for i in json_data['data']['list']['vlist']]
    print(bv_list)
    return bv_list
# 7. Download one video by its BV id
def main(bv_id):
    """
    Main function.
    :param bv_id: the video's BV number
    :return:
    """
    video_info = get_video_info(play_url=f'https://www..com/video/{bv_id}')
    save(video_info[0], video_info[1], video_info[2])
if __name__ == '__main__':
    # Any data you can see on the page can be collected
    msg = """Please choose what you want to do:
    A. Collect multiple pages of search results
    B. Collect a single video
    C. Collect an anime series
    D. Collect all of one UP's videos
    0. Exit the system"""
    while True:
        print(msg)
        kew_word = input('Please enter your choice: ')
        if kew_word == 'A' or kew_word == 'a':
            word = input('Please enter the keyword of the videos you want to download: ')
            page = input('Please enter how many pages you want to download: ')
            for num in range(1, int(page) + 1):
                bv_list = get_search(page=num, word=word)
                for bv_id in bv_list:  # download every video found on this results page
                    main(bv_id)
        elif kew_word == 'B' or kew_word == 'b':
            bv = input('Please enter the BV number of the video you want to download: ')
            main(bv)  # download that single video
        elif kew_word == 'C' or kew_word == 'c':
            print('This feature has not been released yet')
        elif kew_word == 'D' or kew_word == 'd':
            up_id = input('Please enter the id of the UP you want to download: ')
            page = input('Please enter how many pages you want to download: ')
            for num in range(1, int(page) + 1):
                bv_list = get_up_video(page=num, up_id=up_id)
                for bv_id in bv_list:  # download every video on this page of the UP's list
                    main(bv_id)
        elif kew_word == '0':
            break
There is no shortcut to success, and no highway to happiness.
All success comes from tireless effort and running; all happiness comes from ordinary struggle and persistence.
—— Inspirational quote
That's it for this article ~ interested friends can copy the code and give it a try.
Your support is my biggest motivation!! Remember the triple combo (like, coin, favourite) ~ and feel free to read my earlier articles ~
I'm Little Panda, see you in the next article.