However, errors are often reported when installing PyAudio:
pip install pyaudio -i https://pypi.tuna.tsinghua.edu.cn/simple
This installs with pip, using the Tsinghua mirror as the package source. However, it can still fail, because pip cannot resolve the required dependencies — that is the error I ran into.
For details on how to switch mirrors, see the help page on using the Tsinghua mirror.
So, if you are working in an Anaconda environment, you can use the conda command to install it instead.
You can also configure conda to use a domestic mirror:
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/msys2/
# Set the channel address when searching
conda config --set show_channel_urls yes
Reprinted from: https://blog.csdn.net/observador/article/details/83618540
See also the official Tsinghua mirror instructions.
Installation command: conda install pyaudio
On Ubuntu, you can install it with apt instead: sudo apt-get install python-pyaudio
If you have Anaconda3 installed, no other third-party libraries are needed; otherwise, install each of the libraries imported below as required.
import pyaudio
import numpy as np
from scipy import fftpack
import wave
import time
Back to speech recognition: before we can recognize speech, we first need to record it. There are two ways to record:
The first method is fairly rigid — recording stops when the time is up, whether you have finished speaking or not. Its advantage is that the code is relatively simple.
def Luyin(filename, times=0):
    """Record audio from the default microphone for a fixed duration and save it as a WAV file.

    :param filename: path of the output WAV file
    :param times: recording duration in seconds
    """
    CHUNK = 1024              # frames read per buffer
    FORMAT = pyaudio.paInt16  # 16-bit samples
    CHANNELS = 1              # mono
    RATE = 16000              # sampling rate: samples collected per second
    RECORD_SECONDS = times    # recording time
    WAVE_OUTPUT_FILENAME = filename  # file storage location
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                    input=True, frames_per_buffer=CHUNK)
    print("* Recording ...")
    frames = []
    # RATE / CHUNK buffers are read per second, so this loop runs for `times` seconds.
    for _ in range(int(RATE / CHUNK * RECORD_SECONDS)):
        frames.append(stream.read(CHUNK))
    print("* End of recording ")
    stream.stop_stream()
    stream.close()
    p.terminate()
    # Bug fix: the original guarded this with `if startflag:` but `startflag`
    # is never defined in this function (NameError). A timed recording should
    # always be saved.
    with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
The second recording method is more flexible. It uses a threshold: recording starts when the volume rises above the threshold and ends when it stays below the threshold for a certain time — so a short pause is not mistaken for the end of speech.
The code below combines both methods.
If you need a fixed-length recording, set times to a non-zero value; otherwise, leave the default and the recording is started and stopped automatically based on volume.
def recording(filename, times=0, threshold=7000):
    """Record from the microphone and save the result as a WAV file.

    :param filename: path of the output WAV file
    :param times: recording time in seconds; when > 0 record for exactly that
        long, otherwise start/stop automatically based on volume
    :param threshold: mean FFT magnitude above which a buffer counts as speech
    :return: None (writes the WAV file as a side effect)
    """
    CHUNK = 1024              # frames read per buffer
    FORMAT = pyaudio.paInt16  # 16-bit samples
    CHANNELS = 1              # mono
    RATE = 16000              # sampling rate: samples collected per second
    RECORD_SECONDS = times    # recording time
    WAVE_OUTPUT_FILENAME = filename  # file storage location
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                    input=True, frames_per_buffer=CHUNK)
    print("* Recording ...")
    frames = []
    if times > 0:
        # Fixed-duration mode: read exactly `times` seconds worth of buffers.
        # Bug fix: `startflag` was never set on this path, so the save step
        # below raised NameError and the timed recording was never written.
        startflag = True
        for _ in range(int(RATE / CHUNK * RECORD_SECONDS)):
            frames.append(stream.read(CHUNK))
    else:
        # Voice-activated mode: count loud/quiet buffers and stop when the
        # speaker has been quiet long enough (or after a hard 8 s limit).
        stopflag = 0       # loud buffers seen in the current 1 s window
        stopflag2 = 0      # quiet buffers seen in the current 1 s window
        startflag = False  # becomes True once speech has been detected
        f3 = False         # True while inside a loud stretch
        startTime = time.time()
        stopTime = startTime  # last moment speech was heard (robustness: pre-init)
        while True:
            data = stream.read(CHUNK)
            # Interpret the raw buffer as little-endian 16-bit samples.
            rt_data = np.frombuffer(data, np.dtype('<i2'))
            # Fourier transform; keep the non-negative-frequency half.
            fft_temp_data = fftpack.fft(rt_data, rt_data.size, overwrite_x=True)
            fft_data = np.abs(fft_temp_data)[0:fft_temp_data.size // 2 + 1]
            # Mean spectral magnitude vs. threshold decides loud/quiet
            # (microphone threshold, default 7000).
            if sum(fft_data) // len(fft_data) > threshold:
                stopflag += 1
                startflag = True
                f3 = True
                stopTime = time.time()
            else:
                stopflag2 += 1
                if f3:
                    # Just transitioned loud -> quiet; remember when.
                    stopTime = time.time()
                    f3 = False
            oneSecond = int(RATE / CHUNK)  # buffers per second
            if stopflag2 + stopflag > oneSecond:
                # Stop if the last second was mostly quiet, speech had started,
                # and at least 1.5 s passed since the last loud buffer — or
                # unconditionally after 8 s of total recording.
                if stopflag2 > oneSecond // 3 * 2 and startflag and time.time() - stopTime >= 1.5 or time.time() - startTime > 8:
                    break
                else:
                    # Start a fresh 1 s window.
                    stopflag2 = 0
                    stopflag = 0
            if startflag:
                frames.append(data)
    print("* End of recording ")
    stream.stop_stream()
    stream.close()
    p.terminate()
    # Save only if something was actually recorded.
    if startflag:
        with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(p.get_sample_size(FORMAT))
            wf.setframerate(RATE)
            wf.writeframes(b''.join(frames))
After recording, simply upload the audio file. According to Baidu's developer documentation, the recognition result can be extracted directly from the returned JSON data.
from aip import AipSpeech

# Your APPID / API key / secret key — these parameters can be viewed on the
# console of the Baidu cloud speech service.
APP_ID = ' Yours app id'
API_KEY = ' Yours api key'
SECRET_KEY = ' Yours secret_key'
# Bug fix: this is module-level code, so there is no `self` here — the
# original `self.client = ...` raised NameError. Use a module-level client.
client = AipSpeech(APP_ID, API_KEY, SECRET_KEY)


def shiBie(path):
    """Speech-recognition module.

    :param path: path of the audio file (16 kHz WAV)
    :return: the recognized text, or None when recognition fails
    """
    with open(path, 'rb') as fp:
        voices = fp.read()
    try:
        # dev_pid 1537: Mandarin model — see Baidu's ASR documentation.
        result = client.asr(voices, 'wav', 16000, {
            'dev_pid': 1537, })
        # On failure Baidu returns an error dict without a "result" key,
        # which raises the KeyError handled below.
        result_text = result["result"][0]
        print("you said: " + result_text)
        return result_text
    except KeyError:
        print("KeyError")
Speech synthesis is just as simple. To play the audio here, you also need to install the third-party library playsound.
Finally, it is recommended to wrap this code in a function so it can be called directly when needed.
# Import AipSpeech: the speech-recognition Python SDK client.
from aip import AipSpeech

# Your APPID / API key / secret key — these parameters can be viewed on the
# console of the Baidu cloud speech service.
APP_ID = ''
API_KEY = ''
SECRET_KEY = ''
client = AipSpeech(APP_ID, API_KEY, SECRET_KEY)
result = client.synthesis(' If you want to synthesize ', 'zh', 1, {
    'vol': 5,
})
# On success the synthesized speech is returned as binary data; on error a
# dict is returned (refer to the Baidu error-code table).
# Bug fix: the original saved 'auidio.mp3' but played "auido.mp3" (letters
# transposed), so playback always failed. Use one shared filename; playsound
# requires it to be an mp3 file.
AUDIO_FILENAME = 'auidio.mp3'
if not isinstance(result, dict):
    with open(AUDIO_FILENAME, 'wb') as f:
        f.write(result)
    from playsound import playsound
    # Only play when synthesis succeeded and the file was written.
    playsound(AUDIO_FILENAME)