[add] Speech to text feature.

2026-04-05 16:07:03 +02:00
parent 049d4f2a18
commit 81e9760e5c
7 changed files with 213 additions and 4 deletions
@@ -2,3 +2,4 @@ __pycache__/
 *.pyc
 *.pyo
 save.qzl
+vosk-model-*/
@@ -31,6 +31,65 @@ You can also run it directly as a module without installing:
 uv run python -m h2g2.main
 ```

+## Audio features
+
+### Text-to-speech (TTS)
+
+The game can read all output aloud using [Piper](https://github.com/rhasspy/piper), a fast offline TTS engine. A British English voice model is included in the repo.
+
+```bash
+uv run h2g2 --audio
+```
+
+Use `--voice /path/to/model.onnx` to use a different Piper voice model.
+
+### Speech-to-text (STT)
+
+You can play the game hands-free using voice input powered by [Vosk](https://alphacephei.com/vosk/), a lightweight offline speech recognition engine. The default model (`vosk-model-small-en-us-0.15`, US English, ~50 MB) is downloaded automatically into the project root the first time you run with `--stt`.
+
+```bash
+uv run h2g2 --stt
+```
+
+Use `--stt-model /path/to/vosk-model/` to point at a different Vosk model directory that you have already downloaded and unpacked.
+
+Combine both flags for full voice interaction:
+
+```bash
+uv run h2g2 --audio --stt
+```
+
+### Audio prerequisites
+
+STT requires PortAudio for microphone access. Install the system library for your platform before running with `--stt`:
+
+**Linux (Debian/Ubuntu):**
+
+```bash
+sudo apt install libportaudio2 portaudio19-dev
+```
+
+**macOS:**
+
+```bash
+brew install portaudio
+```
+
+**Windows:**
+
+PyAudio ships with PortAudio bundled on Windows, so no extra system package is needed. If you run into build issues, install a prebuilt wheel:
+
+```bash
+uv pip install pipwin
+pipwin install pyaudio
+```
+
+After installing the system dependency, sync the Python packages:
+
+```bash
+uv sync
+```
+
 ## What's playable

 The Earth opening sequence: wake up in your bedroom, find your gown and aspirin, make your way downstairs, head to the pub, and meet Ford Prefect.
@@ -45,10 +45,11 @@ def pre_handler(verb: str) -> Callable:
 class GameLoop:
    """The main game loop — parse, dispatch, clock."""

-    def __init__(self, state: GameState, parser: Parser, *, tts: object | None = None) -> None:
+    def __init__(self, state: GameState, parser: Parser, *, tts: object | None = None, stt: object | None = None) -> None:
        self.state = state
        self.parser = parser
        self.tts = tts
+        self.stt = stt
        self._input = InputHistory()

    def run(self) -> None:
@@ -62,7 +63,7 @@ class GameLoop:

        while state.running:
            try:
-                raw = self._input.input("\n> ")
+                raw = self._get_input()
            except (EOFError, KeyboardInterrupt):
                print("\nGoodbye!")
                break
@@ -90,6 +91,28 @@ class GameLoop:
            if self.tts:
                self.tts.speak(text)

+    def _get_input(self) -> str:
+        """Read the next command, preferring the microphone when STT is on.
+
+        Without an STT engine this is a plain keyboard prompt.  With one, the
+        prompt is printed, the engine is asked to listen, and whatever it
+        returns is echoed and recorded in the input history.  An empty result
+        hands control back to the keyboard for this turn.
+        """
+        if not self.stt:
+            return self._input.input("\n> ")
+
+        # Let pending speech output finish first; otherwise the microphone
+        # would pick up the game's own voice.
+        if self.tts:
+            self.tts.wait()
+
+        print("\n> ", end="", flush=True)
+
+        heard = self.stt.listen()
+        if not heard:
+            # Nothing usable came back. The prompt is already on screen, so
+            # read a typed line without printing a second one.
+            return self._input.input("")
+
+        print(heard)  # show the player what was recognised
+        self._input._history.append(heard)
+        return heard
+
    def _execute(self, result: ParseResult) -> None:
        """Execute a parsed command through the dispatch chain."""
        state = self.state
@@ -0,0 +1,104 @@
+"""Speech-to-text via Vosk — converts microphone input to text commands."""
+
+from __future__ import annotations
+
+import io
+import json
+import sys
+import urllib.request
+import zipfile
+from pathlib import Path
+
+import vosk
+
+# Suppress Vosk's own log messages
+vosk.SetLogLevel(-1)
+
+# Default model: the name is also the last component of the download URL
+_MODEL_NAME = "vosk-model-small-en-us-0.15"
+_MODEL_URL = f"https://alphacephei.com/vosk/models/{_MODEL_NAME}.zip"
+
+SAMPLE_RATE = 16000  # Hz; used for both the recognizer and the mic stream
+CHUNK_SIZE = 4000  # bytes per read (~0.125s at 16kHz 16-bit mono)
+# Upper bound, in seconds of audio read, on one listen() call. The counter
+# advances on every chunk (speech included), so this caps total listening
+# time rather than measuring silence alone.
+SILENCE_TIMEOUT = 10.0
+
+
+def ensure_model(model_path: Path) -> Path:
+    """Return *model_path*, fetching the default Vosk model first if needed.
+
+    If the directory already exists it is returned untouched.  Otherwise the
+    default model archive is downloaded into memory and unpacked into
+    ``model_path.parent``, printing a percentage when the server reports a
+    ``Content-Length``.
+
+    Only the default model can be fetched automatically: the archive is
+    expected to unpack to a directory named ``_MODEL_NAME``, so a missing
+    path with any other name could never be satisfied by the download.
+
+    Raises:
+        FileNotFoundError: *model_path* is missing and is not named
+            ``_MODEL_NAME``, or the unpacked archive did not produce it.
+    """
+    if model_path.exists():
+        return model_path
+
+    # Fail before spending ~50 MB of bandwidth on a download that would land
+    # next to, not at, a custom path.
+    if model_path.name != _MODEL_NAME:
+        raise FileNotFoundError(
+            f"Vosk model not found: {model_path} "
+            f"(only {_MODEL_NAME} can be downloaded automatically)"
+        )
+
+    print("Downloading Vosk speech model (~50 MB) …")
+    buf = io.BytesIO()
+    with urllib.request.urlopen(_MODEL_URL) as resp:
+        total = int(resp.headers.get("Content-Length", 0))
+        downloaded = 0
+        while chunk := resp.read(1 << 16):
+            buf.write(chunk)
+            downloaded += len(chunk)
+            if total:
+                print(f"\r  {downloaded * 100 // total}%", end="", flush=True)
+    print("\r  Extracting …    ")
+    buf.seek(0)
+    with zipfile.ZipFile(buf) as zf:
+        zf.extractall(model_path.parent)
+
+    # Guard against an archive whose top-level directory is not what we expect.
+    if not model_path.exists():
+        raise FileNotFoundError(
+            f"Downloaded archive did not contain {model_path.name}"
+        )
+    print(f"  Model ready: {model_path}")
+    return model_path
+
+
+class STT:
+    """Listens for a spoken command via the microphone and returns text."""
+
+    def __init__(self, model_path: str | Path) -> None:
+        """Load the Vosk model at *model_path*.
+
+        The path is passed through ``ensure_model`` first, which may download
+        the default model when the directory does not exist yet.
+        """
+        model_path = Path(model_path)
+        ensure_model(model_path)
+        self._model = vosk.Model(str(model_path))
+
+    def listen(self) -> str:
+        """Record from the mic and return the first recognised utterance.
+
+        Audio is read in ``CHUNK_SIZE``-byte chunks and fed to a fresh
+        recogniser.  The first completed utterance with non-empty text is
+        returned immediately.  After ``SILENCE_TIMEOUT`` seconds of audio
+        have been read without one, whatever the recogniser has so far is
+        returned instead, which may be an empty string.
+
+        Returns an empty string, after printing a message to stderr, when
+        ``pyaudio`` is not installed or the audio device raises ``OSError``.
+        """
+        try:
+            import pyaudio
+        except ImportError:
+            print(
+                "pyaudio is required for speech input. "
+                "Install it with: uv pip install pyaudio\n"
+                "(You may also need: sudo apt install libportaudio2 portaudio19-dev)",
+                file=sys.stderr,
+            )
+            return ""
+
+        rec = vosk.KaldiRecognizer(self._model, SAMPLE_RATE)
+        frames_per_chunk = CHUNK_SIZE // 2  # 16-bit = 2 bytes/sample
+        chunk_seconds = frames_per_chunk / SAMPLE_RATE
+
+        pa = pyaudio.PyAudio()
+        stream = None
+        try:
+            # Opening the stream sits inside the try so that a missing or
+            # busy microphone still releases the PyAudio instance below.
+            stream = pa.open(
+                format=pyaudio.paInt16,
+                channels=1,
+                rate=SAMPLE_RATE,
+                input=True,
+                frames_per_buffer=frames_per_chunk,
+            )
+            elapsed = 0.0
+            while elapsed < SILENCE_TIMEOUT:
+                data = stream.read(frames_per_chunk, exception_on_overflow=False)
+                if rec.AcceptWaveform(data):
+                    text = json.loads(rec.Result()).get("text", "").strip()
+                    if text:
+                        return text
+                elapsed += chunk_seconds
+        except OSError as err:
+            # Same contract as the missing-pyaudio case: report the problem
+            # and return nothing rather than raising out of listen().
+            print(f"Microphone unavailable: {err}", file=sys.stderr)
+            return ""
+        finally:
+            if stream is not None:
+                stream.stop_stream()
+                stream.close()
+            pa.terminate()
+
+        # Timed out: return whatever the recogniser has buffered so far
+        return json.loads(rec.FinalResult()).get("text", "").strip()
@@ -49,6 +49,11 @@ class TTS:
        self._thread = threading.Thread(target=self._play_wav, args=(wav_data,), daemon=True)
        self._thread.start()

+    def wait(self) -> None:
+        """Block the caller until the current playback thread, if any, ends."""
+        playback = self._thread
+        if not playback or not playback.is_alive():
+            return  # nothing is playing
+        playback.join()
+
    @staticmethod
    def _play_wav(wav_data: bytes) -> None:
        try:
@@ -17,8 +17,9 @@ import h2g2.engine.verbs  # noqa: F401
 # Import content modules
 from h2g2.content import globals_content, earth, vogon, heart, unearth, dark

-# Default voice model location (project root)
+# Default model locations (project root)
 _DEFAULT_VOICE = Path(__file__).resolve().parent.parent / "en_GB-alan-medium.onnx"
+_DEFAULT_STT_MODEL = Path(__file__).resolve().parent.parent / "vosk-model-small-en-us-0.15"


 def main() -> None:
@@ -31,6 +32,14 @@ def main() -> None:
        "--voice", type=Path, default=_DEFAULT_VOICE,
        help="Path to Piper .onnx voice model",
    )
+    ap.add_argument(
+        "--stt", action="store_true",
+        help="Enable speech-to-text input (requires microphone)",
+    )
+    ap.add_argument(
+        "--stt-model", type=Path, default=_DEFAULT_STT_MODEL,
+        help="Path to Vosk model directory",
+    )
    args = ap.parse_args()

    # Build the world
@@ -109,6 +118,12 @@ def main() -> None:
        from h2g2.engine.tts import TTS
        tts = TTS(args.voice)

+    # Initialize STT if requested
+    stt = None
+    if args.stt:
+        from h2g2.engine.stt import STT
+        stt = STT(args.stt_model)
+
    banner = output.flush()
    print(banner, end="")
    if tts:
@@ -116,7 +131,7 @@ def main() -> None:

    # Run the game loop
    parser = Parser()
-    loop = GameLoop(state, parser, tts=tts)
+    loop = GameLoop(state, parser, tts=tts, stt=stt)
    state.flags["_game_loop"] = loop
    loop.run()

@@ -6,6 +6,8 @@ readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
    "piper-tts>=1.2.0",
+    "vosk>=0.3.45",
+    "pyaudio>=0.2.14",
 ]

 [build-system]