[add] Speech to text feature.

This commit is contained in:
2026-04-05 16:07:03 +02:00
parent 049d4f2a18
commit 81e9760e5c
7 changed files with 213 additions and 4 deletions
+1
View File
@@ -2,3 +2,4 @@ __pycache__/
*.pyc
*.pyo
save.qzl
vosk-model-*/
+59
View File
@@ -31,6 +31,65 @@ You can also run it directly as a module without installing:
uv run python -m h2g2.main
```
## Audio features
### Text-to-speech (TTS)
The game can read all output aloud using [Piper](https://github.com/rhasspy/piper), a fast offline TTS engine. A British English voice model is included in the repo.
```bash
uv run h2g2 --audio
```
Pass `--voice /path/to/model.onnx` to select a different Piper voice model.
### Speech-to-text (STT)
You can play the game hands-free using voice input powered by [Vosk](https://alphacephei.com/vosk/), a lightweight offline speech recognition engine. The Vosk model (~50 MB) downloads automatically on first use.
```bash
uv run h2g2 --stt
```
Pass `--stt-model /path/to/vosk-model/` to select a different Vosk model.
Combine both flags for full voice interaction:
```bash
uv run h2g2 --audio --stt
```
### Audio prerequisites
STT requires PortAudio for microphone access. Install the system library for your platform before running with `--stt`:
**Linux (Debian/Ubuntu):**
```bash
sudo apt install libportaudio2 portaudio19-dev
```
**macOS:**
```bash
brew install portaudio
```
**Windows:**
PyAudio ships with PortAudio bundled on Windows, so no extra system package is needed. If you run into build issues, install a prebuilt wheel:
```bash
uv pip install pipwin
pipwin install pyaudio
```
After installing the system dependency, sync the Python packages:
```bash
uv sync
```
## What's playable
The Earth opening sequence: wake up in your bedroom, find your gown and aspirin, make your way downstairs, head to the pub, and meet Ford Prefect.
+25 -2
View File
@@ -45,10 +45,11 @@ def pre_handler(verb: str) -> Callable:
class GameLoop:
"""The main game loop — parse, dispatch, clock."""
def __init__(self, state: GameState, parser: Parser, *, tts: object | None = None) -> None:
def __init__(
    self,
    state: GameState,
    parser: Parser,
    *,
    tts: object | None = None,
    stt: object | None = None,
) -> None:
    """Bind the loop to its game state, parser and optional speech engines.

    ``tts`` and ``stt`` are keyword-only and default to ``None``; other
    methods test them for truthiness before calling into them.
    """
    # Optional speech back-ends.
    self.stt = stt
    self.tts = tts
    # Core collaborators.
    self.parser = parser
    self.state = state
    # Line-input helper used for keyboard prompts.
    self._input = InputHistory()
def run(self) -> None:
@@ -62,7 +63,7 @@ class GameLoop:
while state.running:
try:
raw = self._input.input("\n> ")
raw = self._get_input()
except (EOFError, KeyboardInterrupt):
print("\nGoodbye!")
break
@@ -90,6 +91,28 @@ class GameLoop:
if self.tts:
self.tts.speak(text)
def _get_input(self) -> str:
"""Get player input via STT (with keyboard fallback) or keyboard."""
if not self.stt:
return self._input.input("\n> ")
# Wait for TTS to finish so the mic doesn't pick up game speech
if self.tts:
self.tts.wait()
import sys
sys.stdout.write("\n> ")
sys.stdout.flush()
raw = self.stt.listen()
if raw:
print(raw) # echo what was heard
self._input._history.append(raw)
return raw
# Nothing heard — fall back to keyboard
return self._input.input("")
def _execute(self, result: ParseResult) -> None:
"""Execute a parsed command through the dispatch chain."""
state = self.state
+104
View File
@@ -0,0 +1,104 @@
"""Speech-to-text via Vosk — converts microphone input to text commands."""
from __future__ import annotations
import io
import json
import sys
import urllib.request
import zipfile
from pathlib import Path
import vosk
# Suppress Vosk's own log messages (runs at import time)
vosk.SetLogLevel(-1)
# Default model info: directory name of the model and the zip archive it is fetched from
_MODEL_NAME = "vosk-model-small-en-us-0.15"
_MODEL_URL = f"https://alphacephei.com/vosk/models/{_MODEL_NAME}.zip"
# Sample rate in Hz, passed to both the recognizer and the microphone stream
SAMPLE_RATE = 16000
CHUNK_SIZE = 4000 # bytes per read (~0.125s at 16kHz 16-bit mono)
# Upper bound, in seconds of audio read, that listen() waits for a non-empty
# final result. The counter advances on every chunk, speech or not, so this
# caps total listening time rather than measuring silence alone.
SILENCE_TIMEOUT = 10.0
def ensure_model(model_path: Path) -> Path:
    """Return *model_path*, downloading the default Vosk model if it is missing.

    The archive at ``_MODEL_URL`` is extracted into ``model_path.parent``,
    which is expected to produce a directory named ``_MODEL_NAME``. Only a
    path with that final component can therefore be satisfied by the
    automatic download.

    Args:
        model_path: Directory where the Vosk model lives (or should live).

    Returns:
        ``model_path`` unchanged, once the directory exists.

    Raises:
        FileNotFoundError: If ``model_path`` is missing and is not the default
            model directory name, or if it is still missing after extraction.
    """
    if model_path.exists():
        return model_path

    # A missing custom path cannot be created by unpacking the default
    # archive; fail before spending a ~50 MB download on it.
    if model_path.name != _MODEL_NAME:
        raise FileNotFoundError(
            f"Vosk model not found at {model_path}; only {_MODEL_NAME} "
            "can be downloaded automatically"
        )

    print("Downloading Vosk speech model (~50 MB) …")
    archive = io.BytesIO()
    with urllib.request.urlopen(_MODEL_URL) as resp:
        # Content-Length may be absent; progress is then simply not shown.
        total = int(resp.headers.get("Content-Length", 0))
        downloaded = 0
        while chunk := resp.read(1 << 16):
            archive.write(chunk)
            downloaded += len(chunk)
            if total:
                pct = downloaded * 100 // total
                print(f"\r {pct}%", end="", flush=True)

    print("\r Extracting … ")
    archive.seek(0)
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(model_path.parent)

    # Don't announce success for a directory the archive did not create.
    if not model_path.exists():
        raise FileNotFoundError(
            f"Extracted archive did not create {model_path}"
        )

    print(f" Model ready: {model_path}")
    return model_path
class STT:
    """Listens for a spoken command via the microphone and returns text."""

    def __init__(self, model_path: str | Path) -> None:
        """Load the Vosk model at *model_path*, fetching it first if absent."""
        model_path = Path(model_path)
        ensure_model(model_path)
        self._model = vosk.Model(str(model_path))

    def listen(self) -> str:
        """Record from the mic until a final result arrives, return the text.

        Audio is read in ``CHUNK_SIZE``-byte chunks and fed to a fresh
        recognizer. The first non-empty final result is returned. If
        ``SILENCE_TIMEOUT`` seconds of audio are read without one, whatever
        the recognizer has accumulated is returned instead.

        Returns:
            The transcription, or an empty string if nothing was recognised,
            if ``pyaudio`` is not installed, or if the microphone could not
            be opened.
        """
        try:
            import pyaudio
        except ImportError:
            print(
                "pyaudio is required for speech input. "
                "Install it with: uv pip install pyaudio\n"
                "(You may also need: sudo apt install libportaudio2 portaudio19-dev)",
                file=sys.stderr,
            )
            return ""

        rec = vosk.KaldiRecognizer(self._model, SAMPLE_RATE)
        frames_per_chunk = CHUNK_SIZE // 2  # 16-bit = 2 bytes/sample
        chunk_seconds = frames_per_chunk / SAMPLE_RATE

        pa = pyaudio.PyAudio()
        try:
            # Opening the stream is inside the try so that a failure here
            # still releases the PortAudio instance below.
            try:
                stream = pa.open(
                    format=pyaudio.paInt16,
                    channels=1,
                    rate=SAMPLE_RATE,
                    input=True,
                    frames_per_buffer=frames_per_chunk,
                )
            except OSError as exc:
                # Report and return "" so the caller can fall back to the
                # keyboard, mirroring the missing-pyaudio path above.
                print(f"Could not open the microphone: {exc}", file=sys.stderr)
                return ""

            try:
                elapsed = 0.0
                while elapsed < SILENCE_TIMEOUT:
                    data = stream.read(frames_per_chunk, exception_on_overflow=False)
                    if rec.AcceptWaveform(data):
                        text = json.loads(rec.Result()).get("text", "").strip()
                        if text:
                            return text
                    elapsed += chunk_seconds
            finally:
                stream.stop_stream()
                stream.close()
        finally:
            pa.terminate()

        # Timed out: return whatever the recognizer has buffered so far.
        return json.loads(rec.FinalResult()).get("text", "").strip()
+5
View File
@@ -49,6 +49,11 @@ class TTS:
self._thread = threading.Thread(target=self._play_wav, args=(wav_data,), daemon=True)
self._thread.start()
def wait(self) -> None:
    """Join the current playback thread if there is one and it is still running."""
    playback = self._thread
    if not playback:
        return
    if playback.is_alive():
        playback.join()
@staticmethod
def _play_wav(wav_data: bytes) -> None:
try:
+17 -2
View File
@@ -17,8 +17,9 @@ import h2g2.engine.verbs # noqa: F401
# Import content modules
from h2g2.content import globals_content, earth, vogon, heart, unearth, dark
# Default voice model location (project root)
# Default model locations (project root)
_DEFAULT_VOICE = Path(__file__).resolve().parent.parent / "en_GB-alan-medium.onnx"
_DEFAULT_STT_MODEL = Path(__file__).resolve().parent.parent / "vosk-model-small-en-us-0.15"
def main() -> None:
@@ -31,6 +32,14 @@ def main() -> None:
"--voice", type=Path, default=_DEFAULT_VOICE,
help="Path to Piper .onnx voice model",
)
ap.add_argument(
"--stt", action="store_true",
help="Enable speech-to-text input (requires microphone)",
)
ap.add_argument(
"--stt-model", type=Path, default=_DEFAULT_STT_MODEL,
help="Path to Vosk model directory",
)
args = ap.parse_args()
# Build the world
@@ -109,6 +118,12 @@ def main() -> None:
from h2g2.engine.tts import TTS
tts = TTS(args.voice)
# Initialize STT if requested
stt = None
if args.stt:
from h2g2.engine.stt import STT
stt = STT(args.stt_model)
banner = output.flush()
print(banner, end="")
if tts:
@@ -116,7 +131,7 @@ def main() -> None:
# Run the game loop
parser = Parser()
loop = GameLoop(state, parser, tts=tts)
loop = GameLoop(state, parser, tts=tts, stt=stt)
state.flags["_game_loop"] = loop
loop.run()
+2
View File
@@ -6,6 +6,8 @@ readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"piper-tts>=1.2.0",
"vosk>=0.3.45",
"pyaudio>=0.2.14",
]
[build-system]