Higher ``sample_rate`` values result in better audio quality, but also more bandwidth (and therefore, slower recognition). Additionally, some CPUs, such as those in older Raspberry Pi models, can't keep up if this value is too high.
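For example, on a slower board you might request a reduced capture rate up front; a minimal usage sketch, assuming this module is installed and importable as ``speech_recognition``:

import speech_recognition as sr

# ask for 16 kHz capture in 1024-frame buffers to cut bandwidth and CPU load
mic = sr.Microphone(device_index=None, sample_rate=16000, chunk_size=1024)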
if device_index is not None:  # ensure device index is in range
    assert 0 <= device_index < count, "Device index out of range ({} devices available; device index should be between 0 and {} inclusive)".format(count, count - 1)
if sample_rate is None:  # automatically set the sample rate to the hardware's default sample rate if not specified
    assert isinstance(device_info.get("defaultSampleRate"), (float, int)) and device_info["defaultSampleRate"] > 0, "Invalid device info returned from PyAudio: {}".format(device_info)
    sample_rate = int(device_info["defaultSampleRate"])
finally:
audio.terminate()
self.device_index = device_index
self.format = self.pyaudio_module.paInt16  # 16-bit int sampling
self.SAMPLE_WIDTH = self.pyaudio_module.get_sample_size(self.format)  # size of each sample
self.SAMPLE_RATE = sample_rate  # sampling rate in Hertz
self.CHUNK = chunk_size  # number of frames stored in each buffer
self.audio = None
self.stream = None
@staticmethod
def get_pyaudio():
    """
    Imports the pyaudio module and checks its version. Throws exceptions if pyaudio can't be found or a wrong version is installed.
    """
    try:
        import pyaudiowpatch as pyaudio
    except ImportError:
        raise AttributeError("Could not find PyAudio; check installation")
    from distutils.version import LooseVersion
    if LooseVersion(pyaudio.__version__) < LooseVersion("0.2.11"):  # guard implied by the error message: only versions older than the minimum should raise
        raise AttributeError("PyAudio 0.2.11 or later is required (found version {})".format(pyaudio.__version__))
    return pyaudio
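Callers are expected to go through this helper rather than importing PyAudio directly, so the availability and version checks run in one place; a brief usage sketch:

pyaudio_module = Microphone.get_pyaudio()  # raises AttributeError if PyAudio is missing or too old
audio = pyaudio_module.PyAudio()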
@staticmethod
def list_microphone_names():
    """
    Returns a list of the names of all available microphones. For microphones where the name can't be retrieved, the list entry contains ``None`` instead.

    The index of each microphone's name in the returned list is the same as its device index when creating a ``Microphone`` instance - if you want to use the microphone at index 3 in the returned list, use ``Microphone(device_index=3)``.
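A short sketch of picking a device by index using this correspondence (assuming the package is importable as ``speech_recognition``):

import speech_recognition as sr

for index, name in enumerate(sr.Microphone.list_microphone_names()):
    print("Device {}: {}".format(index, name))
mic = sr.Microphone(device_index=3)  # use the fourth device printed above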
assert isinstance(filename_or_fileobject, (type(""), type(u""))) or hasattr(filename_or_fileobject, "read"), "Given audio file must be a filename string or a file-like object"
# 24-bit audio needs some special handling for old Python versions (workaround for https://bugs.python.org/issue12866)
samples_24_bit_pretending_to_be_32_bit = False
if self.SAMPLE_WIDTH == 3:  # 24-bit audio
    try: audioop.bias(b"", self.SAMPLE_WIDTH, 0)  # test whether this sample width is supported (for example, ``audioop`` in Python 3.3 and below doesn't support sample width 3, while Python 3.4+ does)
    except audioop.error:  # this version of audioop doesn't support 24-bit audio (probably Python 3.3 or less)
        samples_24_bit_pretending_to_be_32_bit = True  # while the ``AudioFile`` instance will outwardly appear to be 32-bit, it will actually internally be 24-bit
        self.SAMPLE_WIDTH = 4  # the ``AudioFile`` instance should present itself as a 32-bit stream now, since we'll be converting into 32-bit on the fly when reading
if not hasattr(self.filename_or_fileobject, "read"):  # only close the file if it was opened by this class in the first place (if the file was originally given as a path)
self.audio_reader = audio_reader  # an audio file object (e.g., a `wave.Wave_read` instance)
self.little_endian = little_endian  # whether the audio data is little-endian (when working with big-endian things, we'll have to convert it to little-endian before we process it)
self.samples_24_bit_pretending_to_be_32_bit = samples_24_bit_pretending_to_be_32_bit  # this is true if the audio is 24-bit audio, but 24-bit audio isn't supported, so we have to pretend that this is 32-bit audio and convert it on the fly
if not isinstance(buffer, bytes): buffer = b""  # workaround for https://bugs.python.org/issue24608
sample_width = self.audio_reader.getsampwidth()
if not self.little_endian:  # big endian format, convert to little endian on the fly
    if hasattr(audioop, "byteswap"):  # ``audioop.byteswap`` was only added in Python 3.4 (incidentally, that also means that we don't need to worry about 24-bit audio being unsupported, since Python 3.4+ always has that functionality)
        buffer = audioop.byteswap(buffer, sample_width)
    else:  # manually reverse the bytes of each sample, which is slower but works well enough as a fallback
        buffer = b"".join(buffer[i:i + sample_width][::-1] for i in range(0, len(buffer), sample_width))  # reverse the bytes within each sample (one straightforward way to do the swap)
# workaround for https://bugs.python.org/issue12866
if self.samples_24_bit_pretending_to_be_32_bit:  # we need to convert samples from 24-bit to 32-bit before we can process them with ``audioop`` functions
    buffer = b"".join(b"\x00" + buffer[i:i + sample_width] for i in range(0, len(buffer), sample_width))  # since we're in little endian, we prepend a zero byte to each 24-bit sample to get a 32-bit sample
    sample_width = 4  # make sure we treat the buffer as 32-bit audio now, after converting it from 24-bit audio
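To make the zero-byte prepend concrete, here is what it does to a single little-endian 24-bit sample (an illustrative snippet, not part of the library):

sample_24 = b"\x56\x34\x12"  # the 24-bit value 0x123456, little-endian
sample_32 = b"\x00" + sample_24  # prepend a least-significant zero byte
assert int.from_bytes(sample_32, "little") == 0x12345600  # same sample, scaled into 32-bit range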
self.energy_threshold = 300  # minimum audio energy to consider for recording
self.dynamic_energy_threshold = True
self.dynamic_energy_adjustment_damping = 0.15
self.dynamic_energy_ratio = 1.5
self.pause_threshold = 0.8  # seconds of non-speaking audio before a phrase is considered complete
self.operation_timeout = None  # seconds after an internal operation (e.g., an API request) starts before it times out, or ``None`` for no timeout
self.phrase_threshold = 0.3  # minimum seconds of speaking audio before we consider the speaking audio a phrase - values below this are ignored (for filtering out clicks and pops)
self.non_speaking_duration = 0.5  # seconds of non-speaking audio to keep on both sides of the recording
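These defaults can be tuned per environment; a hedged example of the kinds of adjustments a noisy room might call for (the values are illustrative, not recommendations):

import speech_recognition as sr

r = sr.Recognizer()
r.energy_threshold = 400  # raise the floor so background hum isn't mistaken for speech
r.dynamic_energy_threshold = False  # optionally pin the threshold instead of letting it adapt
r.pause_threshold = 1.0  # tolerate longer mid-phrase pauses before ending the phrase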
assert isinstance(source, AudioSource), "Source must be an audio source"
assert source.stream is not None, "Audio source must be entered before recording, see documentation for ``AudioSource``; are you using ``source`` outside of a ``with`` statement?"
assert isinstance(source, AudioSource), "Source must be an audio source"
assert source.stream is not None, "Audio source must be entered before adjusting, see documentation for ``AudioSource``; are you using ``source`` outside of a ``with`` statement?"
assert isinstance(source, AudioSource), "Source must be an audio source"
assert source.stream is not None, "Audio source must be entered before listening, see documentation for ``AudioSource``; are you using ``source`` outside of a ``with`` statement?"
assert os.path.isfile(os.path.join(snowboy_configuration[0], "snowboydetect.py")), "``snowboy_configuration[0]`` must be a Snowboy root directory containing ``snowboydetect.py``"
for hot_word_file in snowboy_configuration[1]:
    assert os.path.isfile(hot_word_file), "``snowboy_configuration[1]`` must be a list of Snowboy hot word configuration files"
pause_buffer_count = int(math.ceil(self.pause_threshold / seconds_per_buffer))  # number of buffers of non-speaking audio during a phrase, before the phrase should be considered complete
phrase_buffer_count = int(math.ceil(self.phrase_threshold / seconds_per_buffer))  # minimum number of buffers of speaking audio before we consider the speaking audio a phrase
non_speaking_buffer_count = int(math.ceil(self.non_speaking_duration / seconds_per_buffer))  # maximum number of buffers of non-speaking audio to retain before and after a phrase
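A worked example of this arithmetic, assuming a 1024-frame chunk at 16 kHz (so ``seconds_per_buffer = source.CHUNK / source.SAMPLE_RATE``):

import math

seconds_per_buffer = 1024 / 16000  # 0.064 seconds of audio per buffer
pause_buffer_count = int(math.ceil(0.8 / seconds_per_buffer))  # ceil(12.5) == 13 buffers of silence end a phrase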
# read audio input for phrases until there is a phrase that is long enough
elapsed_time = 0  # number of seconds of audio read
buffer = b""  # an empty buffer means that the stream has ended and there is no data left to read
while True:
    frames = collections.deque()
    if snowboy_configuration is None:
        # store audio input until the phrase starts
        while True:
            # handle waiting too long for phrase by raising an exception
            elapsed_time += seconds_per_buffer
            if timeout and elapsed_time > timeout:
                raise WaitTimeoutError("listening timed out while waiting for phrase to start")
            buffer = source.stream.read(source.CHUNK)
            if len(buffer) == 0: break  # reached end of the stream
            frames.append(buffer)
            if len(frames) > non_speaking_buffer_count:  # ensure we only keep the needed amount of non-speaking buffers
                frames.popleft()
            # detect whether speaking has started on audio input
            energy = audioop.rms(buffer, source.SAMPLE_WIDTH)  # energy of the audio signal
            if energy > self.energy_threshold: break
            # dynamically adjust the energy threshold using asymmetric weighted average
            if self.dynamic_energy_threshold:
                damping = self.dynamic_energy_adjustment_damping ** seconds_per_buffer  # account for different chunk sizes and rates
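                # sketch of the remaining update step: blend the threshold toward the current
                # energy scaled by ``dynamic_energy_ratio`` (louder input pulls the threshold up)
                target_energy = energy * self.dynamic_energy_ratio
                self.energy_threshold = self.energy_threshold * damping + target_energy * (1 - damping)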
assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
assert isinstance(language, str) or (isinstance(language, tuple) and len(language) == 3), "``language`` must be a string or 3-tuple of Sphinx data file paths of the form ``(acoustic_parameters, language_model, phoneme_dictionary)``"
assert keyword_entries is None or all(isinstance(keyword, (type(""), type(u""))) and 0 <= sensitivity <= 1 for keyword, sensitivity in keyword_entries), "``keyword_entries`` must be ``None`` or a list of pairs of strings and numbers between 0 and 1"
# import the PocketSphinx speech recognition module
try:
    from pocketsphinx import pocketsphinx, Jsgf, FsgModel
except ImportError:
    raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.")
except ValueError:
    raise RequestError("bad PocketSphinx installation; try reinstalling PocketSphinx version 0.0.9 or better.")
config.set_string("-logfn", os.devnull)  # disable logging (logging causes unwanted output in terminal)
decoder = pocketsphinx.Decoder(config)
# obtain audio data
raw_data = audio_data.get_raw_data(convert_rate=16000, convert_width=2)  # the included language models require audio to be 16-bit mono 16 kHz in little-endian format
# obtain recognition results
if keyword_entries is not None:  # explicitly specified set of keywords
    with PortableNamedTemporaryFile("w") as f:
        # generate a keywords file - Sphinx documentation recommends sensitivities between 1e-50 and 1e-5
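From the caller's side, keyword spotting looks like this; a hedged usage sketch (``command.wav`` is a hypothetical file name):

import speech_recognition as sr

r = sr.Recognizer()
with sr.AudioFile("command.wav") as source:
    audio = r.record(source)
# listen only for these words; each sensitivity is a number between 0 and 1
print(r.recognize_sphinx(audio, keyword_entries=[("hello", 0.7), ("stop", 0.9)]))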
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
"""
assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
assert key is None or isinstance(key, str), "``key`` must be ``None`` or a string"
assert isinstance(language, str), "``language`` must be a string"
flac_data = audio_data.get_flac_data(
    convert_rate=None if audio_data.sample_rate >= 8000 else 8000,  # audio samples must be at least 8 kHz
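A typical call site that handles both failure modes described above; a usage sketch (``speech.wav`` is a hypothetical file name):

import speech_recognition as sr

r = sr.Recognizer()
with sr.AudioFile("speech.wav") as source:
    audio = r.record(source)
try:
    print(r.recognize_google(audio, language="en-US"))
except sr.UnknownValueError:
    print("speech was unintelligible")
except sr.RequestError as e:
    print("recognition request failed: {}".format(e))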
If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings <https://cloud.google.com/speech/limits#content>`__.
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection.
"""
assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
assert isinstance(language, str), "``language`` must be a string"
assert preferred_phrases is None or all(isinstance(preferred_phrase, (type(""), type(u""))) for preferred_phrase in preferred_phrases), "``preferred_phrases`` must be a list of strings"
convert_rate=None if 8000 <= audio_data.sample_rate <= 48000 else max(8000, min(audio_data.sample_rate, 48000)),  # audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range
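The clamping expression is easier to read pulled out on its own; an equivalent standalone sketch (``clamp_rate`` is a hypothetical helper, where ``None`` means "no conversion needed"):

def clamp_rate(rate):
    return None if 8000 <= rate <= 48000 else max(8000, min(rate, 48000))

assert clamp_rate(96000) == 48000  # downsample overly fast audio
assert clamp_rate(11025) is None   # already in range, pass through
assert clamp_rate(4000) == 8000    # upsample overly slow audio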
To get the API key for a Wit.ai app, go to the app's overview page, go to the section titled "Make an API request", and look for something along the lines of ``Authorization: Bearer XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX``; ``XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX`` is the API key. Wit.ai API keys are 32-character uppercase alphanumeric strings.
Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response <https://wit.ai/docs/http/20141022#get-intent-via-text-link>`__ as a JSON dictionary.
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
"""
assert isinstance(audio_data, AudioData), "Data must be audio data"
assert isinstance(key, str), "``key`` must be a string"
wav_data = audio_data.get_wav_data(
    convert_rate=None if audio_data.sample_rate >= 8000 else 8000,  # audio samples must be at least 8 kHz
To get the API key, go to the `Microsoft Azure Portal Resources <https://portal.azure.com/>`__ page, go to "All Resources" > "Add" > "See All" > Search "Speech" > "Create", and fill in the form to make a "Speech" resource. On the resulting page (which is also accessible from the "All Resources" page in the Azure Portal), go to the "Show Access Keys" page, which will have two API keys, either of which can be used for the `key` parameter. Microsoft Azure Speech API keys are 32-character lowercase hexadecimal strings.
The recognition language is determined by ``language``, a BCP-47 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language values can be found in the `API documentation <https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition#recognition-language>`__ under "Interactive and dictation mode".
Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response <https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition#sample-responses>`__ as a JSON dictionary.
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
"""
assert isinstance(audio_data, AudioData), "Data must be audio data"
assert isinstance(key, str), "``key`` must be a string"
# assert isinstance(result_format, str), "``format`` must be a string"  # simple|detailed
assert isinstance(language, str), "``language`` must be a string"
self.azure_cached_access_token_expiry = start_time + 600  # according to https://docs.microsoft.com/en-us/azure/cognitive-services/Speech-Service/rest-apis#authentication, the token expires in exactly 10 minutes
wav_data = audio_data.get_wav_data(
    convert_rate=16000,  # audio samples must be 8 kHz or 16 kHz
To get the API key, go to the `Microsoft Azure Portal Resources <https://portal.azure.com/>`__ page, go to "All Resources" > "Add" > "See All" > Search "Bing Speech API" > "Create", and fill in the form to make a "Bing Speech API" resource. On the resulting page (which is also accessible from the "All Resources" page in the Azure Portal), go to the "Show Access Keys" page, which will have two API keys, either of which can be used for the `key` parameter. Microsoft Bing Speech API keys are 32-character lowercase hexadecimal strings.
The recognition language is determined by ``language``, a BCP-47 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language values can be found in the `API documentation <https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition#recognition-language>`__ under "Interactive and dictation mode".
Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response <https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition#sample-responses>`__ as a JSON dictionary.
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
"""
assert isinstance(audio_data, AudioData), "Data must be audio data"
assert isinstance(key, str), "``key`` must be a string"
assert isinstance(language, str), "``language`` must be a string"
self.bing_cached_access_token_expiry = start_time + 600  # according to https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition, the token expires in exactly 10 minutes
wav_data = audio_data.get_wav_data(
    convert_rate=16000,  # audio samples must be 8 kHz or 16 kHz
The Houndify client ID and client key are specified by ``client_id`` and ``client_key``, respectively. Unfortunately, these are not available without `signing up for an account <https://www.houndify.com/signup>`__. Once logged into the `dashboard <https://www.houndify.com/dashboard>`__, you will want to select "Register a new client", and fill in the form as necessary. When at the "Enable Domains" page, enable the "Speech To Text Only" domain, and then select "Save & Continue".
To get the client ID and client key for a Houndify client, go to the `dashboard <https://www.houndify.com/dashboard>`__ and select the client's "View Details" link. On the resulting page, the client ID and client key will be visible. Client IDs and client keys are both Base64-encoded strings.
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
"""
assert isinstance(audio_data, AudioData), "Data must be audio data"
assert isinstance(client_id, str), "``client_id`` must be a string"
assert isinstance(client_key, str), "``client_key`` must be a string"
wav_data = audio_data.get_wav_data(
    convert_rate=None if audio_data.sample_rate in [8000, 16000] else 16000,  # audio samples must be 8 kHz or 16 kHz
The IBM Speech to Text username and password are specified by ``username`` and ``password``, respectively. Unfortunately, these are not available without `signing up for an account <https://console.ng.bluemix.net/registration/>`__. Once logged into the Bluemix console, follow the instructions for `creating an IBM Watson service instance <https://www.ibm.com/watson/developercloud/doc/getting_started/gs-credentials.shtml>`__, where the Watson service is "Speech To Text". IBM Speech to Text usernames are strings of the form XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX, while passwords are mixed-case alphanumeric strings.
The recognition language is determined by ``language``, an RFC5646 language tag with a dialect like ``"en-US"`` (US English) or ``"zh-CN"`` (Mandarin Chinese), defaulting to US English. The supported language values are listed under the ``model`` parameter of the `audio recognition API documentation <https://www.ibm.com/watson/developercloud/speech-to-text/api/v1/#sessionless_methods>`__, in the form ``LANGUAGE_BroadbandModel``, where ``LANGUAGE`` is the language value.
Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response <https://www.ibm.com/watson/developercloud/speech-to-text/api/v1/#sessionless_methods>`__ as a JSON dictionary.
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
"""
assert isinstance(audio_data, AudioData), "Data must be audio data"
assert isinstance(key, str), "``key`` must be a string"
flac_data = audio_data.get_flac_data(
    convert_rate=None if audio_data.sample_rate >= 16000 else 16000,  # audio samples should be at least 16 kHz
    convert_width=None if audio_data.sample_width >= 2 else 2  # audio samples should be at least 16-bit
"""Limited replacement for ``tempfile.NamedTemporaryFile``, except unlike ``tempfile.NamedTemporaryFile``, the file can be opened again while it's currently open, even on Windows."""
def __init__(self, mode="w+b"):
    self.mode = mode

def __enter__(self):
    # create the temporary file and open it
    file_descriptor, file_path = tempfile.mkstemp()
    self._file = os.fdopen(file_descriptor, self.mode)

    # the name property is a public field
    self.name = file_path
    return self

def __exit__(self, exc_type, exc_value, traceback):
    self._file.close()
    os.remove(self.name)

def write(self, *args, **kwargs):
    return self._file.write(*args, **kwargs)

def writelines(self, *args, **kwargs):
    return self._file.writelines(*args, **kwargs)

def flush(self, *args, **kwargs):
    return self._file.flush(*args, **kwargs)
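The point of the wrapper is that the path in ``name`` can be reopened while the wrapper still holds the file open, which ``tempfile.NamedTemporaryFile`` does not guarantee on Windows; a usage sketch:

with PortableNamedTemporaryFile("w") as f:
    f.write("temporary contents")
    f.flush()  # make the bytes visible to other readers
    with open(f.name) as same_file:  # reopening by name works even while ``f`` is open
        print(same_file.read())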
# ===============================
# backwards compatibility shims
# ===============================
WavFile = AudioFile  # WavFile was renamed to AudioFile in 3.4.1

Recognizer.recognize_api = classmethod(recognize_api)  # API.AI Speech Recognition is deprecated/not recommended as of 3.5.0, and currently is only optionally available for paid plans