1

DeepSpeech を使用してオーディオをテキストに変換しようとすると、Mozilla/DeepSpeech のデフォルトのオーディオ ファイルで問題なく動作します。しかし、PC のマイクからオーディオを録音してモデルにフィードしようとすると、エラー (「wave.Error: unknown format: 3」) が発生します。サウンドデバイス ライブラリを使用してオーディオを録音しています。サンプルレートとチャンネル数を変更しようとしましたが、うまくいきませんでした。エラーの原因が「波」機能であることはわかっていますが、わかりませんでした。助けてください

私のコード:

from deepspeech import Model
import numpy as np
import os
import wave
from playsound import playsound
import sounddevice as sd
from scipy.io.wavfile import write
model_file_path = 'deepspeech-0.8.2-models.pbmm'
lm_file_path = 'deepspeech-0.9.3-models.scorer'

beam_width = 500
lm_alpha = 0.93
lm_beta = 1.18

model = Model(model_file_path)
model.enableExternalScorer(lm_file_path)

model.setScorerAlphaBeta(lm_alpha, lm_beta)
model.setBeamWidth(beam_width)


def read_wav_file(filename):
    with wave.open(filename, 'rb') as w:
        rate = w.getframerate()
        frames = w.getnframes()
        buffer = w.readframes(frames)
        print("Rate:", rate)
        print("Frames:", frames)
        print("Buffer Len:", len(buffer))

    return buffer, rate


def transcribe_batch(audio_file):
    buffer, rate = read_wav_file(audio_file)
    data16 = np.frombuffer(buffer, dtype=np.int16)
    return model.stt(data16)


fs = 48000  # Sample rate
seconds = 5  # Duration of recording
myrecording = sd.rec(int(seconds * fs), samplerate=fs,     channels=1)
sd.wait()  # Wait until recording is finished
write('output.wav', fs, myrecording)  # Save as WAV file
playsound('output.wav')


trans = transcribe_batch('output.wav')
print("trancribed message: ", trans)

with open('subtitle.txt', 'w') as f:
   f.write(trans)


print("executed successfully")

エラー:

TensorFlow: v2.3.0-6-g23ad988fcd
DeepSpeech: v0.9.3-0-gf2e9c858
2021-11-14 00:57:24.539394: I tensorflow/core/platform/cpu_feature_guard.cc:142] This     TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the     following CPU instructions in performance-critical operations:  AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Traceback (most recent call last):
  File "C:/Users/Lenovo/PycharmProjects/deepspeech-wenv/test.py", line 48, in <module>
    trans = transcribe_batch('output.wav')
  File "C:/Users/Lenovo/PycharmProjects/deepspeech-wenv/test.py", line 35, in transcribe_batch
    buffer, rate = read_wav_file(audio_file)
  File "C:/Users/Lenovo/PycharmProjects/deepspeech-wenv/test.py", line 23, in read_wav_file
    with wave.open(filename, 'rb') as w:
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python36\lib\wave.py", line 499, in open
    return Wave_read(f)
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python36\lib\wave.py", line 163, in     __init__
    self.initfp(f)
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python36\lib\wave.py", line 143, in initfp
    self._read_fmt_chunk(chunk)
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python36\lib\wave.py", line 260, in _read_fmt_chunk
    raise Error('unknown format: %r' % (wFormatTag,))
wave.Error: unknown format: 3
4

0 に答える 0