Use the speech_recognition library to capture real-time computer audio input, then use the requests and json libraries to upload the generated audio file to Baidu's cloud speech recognition server. The returned result is displayed as subtitles with the wx library and, at the same time, written to a txt file as a record. The threading library is used to run two threads: one records and recognizes audio, and the other reads the txt file and generates the subtitles.
This project is a personal beginner project. Its original name is TRAS (Toolkit for Recognition and Automatic Summarization), a speech recognition and automatic summarization tool. Please see my GitHub. This project can be used to generate subtitles for online classes or other spoken audio, and it can also help deaf people "listen" to computer audio. The code is not very standardized; please forgive me!
The following code is also available on my GitHub. Comments have been added here for the purpose of learning and sharing.
import requests
import json
import base64
import os
import logging
import speech_recognition as sr
import wx
import threading
# Libraries used by the program
def get_token():
    """Request an OAuth access token for the Baidu speech recognition API.

    The client ID and secret are taken from the ``BAIDU_CLIENT_ID`` and
    ``BAIDU_CLIENT_SECRET`` environment variables when they are set, and
    otherwise fall back to the demo credentials embedded below.

    Returns:
        The ``access_token`` string from the token endpoint's JSON response.

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
        KeyError: if the response contains no ``access_token`` field.
    """
    logging.info('Retrieving token...')
    baidu_server = "https://openapi.baidu.com/oauth/2.0/token"
    params = {
        "grant_type": "client_credentials",
        # Prefer credentials supplied through the environment so that private
        # keys do not have to be written into the source file.
        "client_id": os.environ.get(
            "BAIDU_CLIENT_ID", "EUON57v2pcpk5CDQnet6AN6s"),
        "client_secret": os.environ.get(
            "BAIDU_CLIENT_SECRET", "oHb0INPt5MGSC4LfoQ9hd7W2oSR6GLmV"),
    }
    # Let requests build and URL-encode the query string; the timeout keeps a
    # stalled connection from blocking the caller forever.
    res = requests.post(baidu_server, params=params, timeout=10)
    payload = json.loads(res.text)
    try:
        return payload["access_token"]
    except KeyError:
        # Log whatever the server sent back so the failure can be diagnosed.
        logging.error('Token request failed: %s', payload)
        raise
def audio_baidu(filename):
    """Upload a recorded WAV file to Baidu speech recognition.

    Args:
        filename: Name of the audio file inside the ``record`` directory.

    Returns:
        The ``result`` field of the server response (the recognized text
        candidates) when recognition succeeds, or ``-1`` when the server
        does not report success.

    Raises:
        OSError: if the audio file cannot be read.
        requests.RequestException: if an HTTP request fails or times out.
    """
    os.makedirs('record', exist_ok=True)  # no-op when the directory exists
    filename = 'record/' + filename
    logging.info('Analysing audio file...')
    with open(filename, "rb") as f:
        raw_audio = f.read()
    # The API takes the audio as base64 text plus the byte length of the
    # original (un-encoded) data.
    speech = base64.b64encode(raw_audio).decode('utf-8')
    size = len(raw_audio)
    token = get_token()
    headers = {'Content-Type': 'application/json'}
    url = "https://vop.baidu.com/server_api"
    data = {
        "format": "wav",
        "rate": "16000",
        # Recognition model: 1737 = English (other values are listed in
        # Baidu's technical documentation).
        "dev_pid": 1737,
        "speech": speech,
        # Arbitrary string identifying this client.
        "cuid": "3.141592653589793238462643383279502884197169399375105820",
        "len": size,
        "channel": 1,
        "token": token,
    }
    # headers must be passed by keyword: the third positional parameter of
    # requests.post is `json`, so a positional dict is never sent as headers.
    req = requests.post(url, data=json.dumps(data), headers=headers,
                        timeout=30)
    result = json.loads(req.text)
    # .get() so that a response without err_msg counts as a failure instead
    # of raising KeyError.
    if result.get("err_msg") == "success.":
        message = ''.join(result['result'])
        print('RETURNED: ' + message)
        return result['result']
    print("RETURNED: Recognition failure")
    return -1
def main():
    """Worker thread: record microphone audio and append recognized text.

    Loops forever. Each pass records one phrase, saves it as
    ``record/00<N>.wav``, sends it to ``audio_baidu`` and, when recognition
    succeeds, appends the recognized text as one line of
    ``record/history.txt``. Failed recordings or recognitions are logged and
    skipped so that the loop keeps running.
    """
    logging.basicConfig(level=logging.INFO)
    os.makedirs('record', exist_ok=True)  # the WAV files are written here
    wav_num = 0
    while True:
        r = sr.Recognizer()
        mic = sr.Microphone()
        logging.info('Recording...')
        try:
            with mic as source:
                r.adjust_for_ambient_noise(source)  # calibrate to background noise
                # NOTE: listen()'s timeout is expressed in seconds, not
                # milliseconds.
                audio = r.listen(source, timeout=1000)
        except sr.WaitTimeoutError:
            # Nobody spoke before the timeout; start listening again rather
            # than letting the exception end the thread.
            logging.info('No speech detected, listening again...')
            continue

        wav_name = f"00{wav_num}.wav"
        wav_num += 1
        with open('record/' + wav_name, "wb") as f:
            f.write(audio.get_wav_data(convert_rate=16000))

        try:
            result = audio_baidu(wav_name)
        except requests.RequestException:
            logging.exception('Speech recognition request failed')
            continue
        if result == -1:
            # audio_baidu signals a failed recognition with -1; there is no
            # text to join or record in that case.
            continue

        message = ''.join(result)
        with open('record/' + "history.txt", "a") as history:
            history.write(message + '\n')
def update_content(win, height=200, width=800):
    """Show the most recent line of the history file in the subtitle window.

    Args:
        win: Parent wx window that receives the text control.
        height: Height of the text control.
        width: Width of the text control.

    Returns:
        The last line of ``record/history.txt`` (including its trailing
        newline, if any), or ``''`` when the file is empty or missing.
    """
    try:
        # `with` guarantees the file is closed even if reading fails.
        with open('record/' + "history.txt", "r") as f:
            lines = f.readlines()
    except FileNotFoundError:
        # Nothing has been recorded yet: show an empty subtitle.
        lines = []
    last_line = lines[-1] if lines else ''

    # Lines consisting of one of these special phrases are shown larger and
    # in bold.
    if last_line.strip('\n') in ['key point']:
        logging.info('Emphasized')
        ft = wx.Font(80, wx.MODERN, wx.NORMAL, wx.BOLD, False, '')
    else:
        ft = wx.Font(50, wx.MODERN, wx.NORMAL, wx.NORMAL, False, '')

    text_ctrl = wx.TextCtrl(win, value='', pos=(0, 0), size=(width, height))
    text_ctrl.SetInsertionPoint(0)  # insert from the start of the control
    text_ctrl.SetFont(ft)
    text_ctrl.SetValue(last_line)
    return last_line
def show_win(x=320, y=550, height=200, width=800):
    """Create and show the always-on-top subtitle frame.

    Args:
        x: Horizontal screen position of the frame.
        y: Vertical screen position of the frame.
        height: Frame height.
        width: Frame width.

    Returns:
        The newly created, already shown ``wx.Frame``.
    """
    frame = wx.Frame(
        None,
        title="TRAS v1.0.0",
        pos=(x, y),
        size=(width, height),
        style=wx.STAY_ON_TOP,
    )
    # NOTE(review): wx documents the alpha argument as 0-255; the value 1000
    # is kept unchanged here -- confirm the intended opacity.
    frame.SetTransparent(1000)
    frame.Show()
    return frame
# The main program
# Program entry point
if __name__ == "__main__":
    # The record directory must exist before the history file can be created.
    os.makedirs('record', exist_ok=True)
    # Create the history file, or empty it if it is left over from a
    # previous run.
    with open('record/' + "history.txt", "w+"):
        pass

    # Speech recognition runs in a background thread. daemon=True lets the
    # process exit when the GUI loop below ends instead of being kept alive
    # by the endless recording loop.
    thread = threading.Thread(target=main, daemon=True)
    thread.start()

    app = wx.App()
    while True:
        win = show_win()  # create the subtitle window
        update_content(win)  # fill it with the latest recognized line
        # Destroy the window after two seconds; MainLoop() then returns
        # because no top-level window is left, and the loop shows the next
        # subtitle.
        wx.CallLater(2000, win.Destroy)
        app.MainLoop()
(For how to run the whole project, please see GitHub.)
When the program starts running, speak into the computer microphone, and the real-time subtitles will be displayed!
The program currently supports macOS; Windows has not been tested. The wx library was also chosen because it has better Mac support. For other auxiliary functions, see GitHub. The version here performs English recognition. The ID and secret key currently used belong to the blogger's free quota, so an error may be reported once the upper limit is reached. Interested readers can explore other features of Baidu Cloud and register their own account to obtain a call quota. If you have any questions, please send a private message or leave a comment in the comment area!
Winter Olympics Dialogue Syste
1. Purpose of the experimentIm