[add] Speech to text feature.
This commit is contained in:
@@ -2,3 +2,4 @@ __pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
save.qzl
|
||||
vosk-model-*/
|
||||
|
||||
@@ -31,6 +31,65 @@ You can also run it directly as a module without installing:
|
||||
uv run python -m h2g2.main
|
||||
```
|
||||
|
||||
## Audio features
|
||||
|
||||
### Text-to-speech (TTS)
|
||||
|
||||
The game can read all output aloud using [Piper](https://github.com/rhasspy/piper), a fast offline TTS engine. A British English voice model is included in the repo.
|
||||
|
||||
```bash
|
||||
uv run h2g2 --audio
|
||||
```
|
||||
|
||||
Pass `--voice /path/to/model.onnx` to select a different Piper voice model.
|
||||
|
||||
### Speech-to-text (STT)
|
||||
|
||||
You can play the game hands-free using voice input powered by [Vosk](https://alphacephei.com/vosk/), a lightweight offline speech recognition engine. The Vosk model (~50 MB) downloads automatically on first use.
|
||||
|
||||
```bash
|
||||
uv run h2g2 --stt
|
||||
```
|
||||
|
||||
Pass `--stt-model /path/to/vosk-model/` to select a different Vosk model.
|
||||
|
||||
Combine both flags for full voice interaction:
|
||||
|
||||
```bash
|
||||
uv run h2g2 --audio --stt
|
||||
```
|
||||
|
||||
### Audio prerequisites
|
||||
|
||||
STT requires PortAudio for microphone access. Install the system library for your platform before running with `--stt`:
|
||||
|
||||
**Linux (Debian/Ubuntu):**
|
||||
|
||||
```bash
|
||||
sudo apt install libportaudio2 portaudio19-dev
|
||||
```
|
||||
|
||||
**macOS:**
|
||||
|
||||
```bash
|
||||
brew install portaudio
|
||||
```
|
||||
|
||||
**Windows:**
|
||||
|
||||
PyAudio ships with PortAudio bundled on Windows, so no extra system package is needed. If you run into build issues, install a prebuilt wheel:
|
||||
|
||||
```bash
|
||||
uv pip install pipwin
|
||||
pipwin install pyaudio
|
||||
```
|
||||
|
||||
After installing the system dependency, sync the Python packages:
|
||||
|
||||
```bash
|
||||
uv sync
|
||||
```
|
||||
|
||||
## What's playable
|
||||
|
||||
The Earth opening sequence: wake up in your bedroom, find your gown and aspirin, make your way downstairs, head to the pub, and meet Ford Prefect.
|
||||
|
||||
+25
-2
@@ -45,10 +45,11 @@ def pre_handler(verb: str) -> Callable:
|
||||
class GameLoop:
|
||||
"""The main game loop — parse, dispatch, clock."""
|
||||
|
||||
def __init__(self, state: GameState, parser: Parser, *, tts: object | None = None) -> None:
|
||||
def __init__(self, state: GameState, parser: Parser, *, tts: object | None = None, stt: object | None = None) -> None:
|
||||
self.state = state
|
||||
self.parser = parser
|
||||
self.tts = tts
|
||||
self.stt = stt
|
||||
self._input = InputHistory()
|
||||
|
||||
def run(self) -> None:
|
||||
@@ -62,7 +63,7 @@ class GameLoop:
|
||||
|
||||
while state.running:
|
||||
try:
|
||||
raw = self._input.input("\n> ")
|
||||
raw = self._get_input()
|
||||
except (EOFError, KeyboardInterrupt):
|
||||
print("\nGoodbye!")
|
||||
break
|
||||
@@ -90,6 +91,28 @@ class GameLoop:
|
||||
if self.tts:
|
||||
self.tts.speak(text)
|
||||
|
||||
def _get_input(self) -> str:
    """Return the next player command, preferring speech when STT is enabled.

    With no STT engine configured, the command is read from the keyboard
    behind the usual prompt.  Otherwise any TTS playback is allowed to
    finish, the prompt is written out, and the recogniser is asked for a
    command.  A non-empty transcription is echoed, appended to the input
    history and returned; when nothing was heard the keyboard is read
    instead, without printing a second prompt.
    """
    import sys

    recogniser = self.stt
    if not recogniser:
        return self._input.input("\n> ")

    # Let the game finish talking first, otherwise the microphone would
    # pick up the game's own speech.
    speaker = self.tts
    if speaker:
        speaker.wait()

    sys.stdout.write("\n> ")
    sys.stdout.flush()

    heard = recogniser.listen()
    if not heard:
        # Silence: hand over to the keyboard on the prompt already shown.
        return self._input.input("")

    print(heard)  # show the player what was recognised
    self._input._history.append(heard)
    return heard
|
||||
|
||||
def _execute(self, result: ParseResult) -> None:
|
||||
"""Execute a parsed command through the dispatch chain."""
|
||||
state = self.state
|
||||
|
||||
@@ -0,0 +1,104 @@
|
||||
"""Speech-to-text via Vosk — converts microphone input to text commands."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import json
|
||||
import sys
|
||||
import urllib.request
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
import vosk
|
||||
|
||||
# Suppress Vosk's own log messages
|
||||
vosk.SetLogLevel(-1)
|
||||
|
||||
# Default model info
|
||||
_MODEL_NAME = "vosk-model-small-en-us-0.15"
|
||||
_MODEL_URL = f"https://alphacephei.com/vosk/models/{_MODEL_NAME}.zip"
|
||||
|
||||
SAMPLE_RATE = 16000
|
||||
CHUNK_SIZE = 4000 # bytes per read (~0.125s at 16kHz 16-bit mono)
|
||||
SILENCE_TIMEOUT = 10.0 # max seconds listen() keeps recording without a completed utterance (counted per chunk, speech or not)
|
||||
|
||||
|
||||
def ensure_model(model_path: Path) -> Path:
|
||||
"""Download the Vosk model if it doesn't exist yet."""
|
||||
if model_path.exists():
|
||||
return model_path
|
||||
print(f"Downloading Vosk speech model (~50 MB) …")
|
||||
buf = io.BytesIO()
|
||||
with urllib.request.urlopen(_MODEL_URL) as resp:
|
||||
total = int(resp.headers.get("Content-Length", 0))
|
||||
downloaded = 0
|
||||
while True:
|
||||
chunk = resp.read(1 << 16)
|
||||
if not chunk:
|
||||
break
|
||||
buf.write(chunk)
|
||||
downloaded += len(chunk)
|
||||
if total:
|
||||
pct = downloaded * 100 // total
|
||||
print(f"\r {pct}%", end="", flush=True)
|
||||
print("\r Extracting … ")
|
||||
buf.seek(0)
|
||||
with zipfile.ZipFile(buf) as zf:
|
||||
zf.extractall(model_path.parent)
|
||||
print(f" Model ready: {model_path}")
|
||||
return model_path
|
||||
|
||||
|
||||
class STT:
|
||||
"""Listens for a spoken command via the microphone and returns text."""
|
||||
|
||||
def __init__(self, model_path: str | Path) -> None:
|
||||
model_path = Path(model_path)
|
||||
ensure_model(model_path)
|
||||
self._model = vosk.Model(str(model_path))
|
||||
|
||||
def listen(self) -> str:
|
||||
"""Record from the mic until speech + silence, return transcription.
|
||||
|
||||
Returns an empty string if nothing is heard within the timeout.
|
||||
"""
|
||||
try:
|
||||
import pyaudio
|
||||
except ImportError:
|
||||
print(
|
||||
"pyaudio is required for speech input. "
|
||||
"Install it with: uv pip install pyaudio\n"
|
||||
"(You may also need: sudo apt install libportaudio2 portaudio19-dev)",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return ""
|
||||
|
||||
rec = vosk.KaldiRecognizer(self._model, SAMPLE_RATE)
|
||||
pa = pyaudio.PyAudio()
|
||||
stream = pa.open(
|
||||
format=pyaudio.paInt16,
|
||||
channels=1,
|
||||
rate=SAMPLE_RATE,
|
||||
input=True,
|
||||
frames_per_buffer=CHUNK_SIZE // 2, # 16-bit = 2 bytes/sample
|
||||
)
|
||||
|
||||
try:
|
||||
elapsed = 0.0
|
||||
chunk_seconds = (CHUNK_SIZE / 2) / SAMPLE_RATE # samples per chunk / rate
|
||||
while elapsed < SILENCE_TIMEOUT:
|
||||
data = stream.read(CHUNK_SIZE // 2, exception_on_overflow=False)
|
||||
if rec.AcceptWaveform(data):
|
||||
result = json.loads(rec.Result())
|
||||
text = result.get("text", "").strip()
|
||||
if text:
|
||||
return text
|
||||
elapsed += chunk_seconds
|
||||
finally:
|
||||
stream.stop_stream()
|
||||
stream.close()
|
||||
pa.terminate()
|
||||
|
||||
# Check partial result on timeout
|
||||
result = json.loads(rec.FinalResult())
|
||||
return result.get("text", "").strip()
|
||||
@@ -49,6 +49,11 @@ class TTS:
|
||||
self._thread = threading.Thread(target=self._play_wav, args=(wav_data,), daemon=True)
|
||||
self._thread.start()
|
||||
|
||||
def wait(self) -> None:
    """Block the caller until the current playback thread, if any, has ended."""
    playback = self._thread
    if not playback or not playback.is_alive():
        return
    playback.join()
|
||||
|
||||
@staticmethod
|
||||
def _play_wav(wav_data: bytes) -> None:
|
||||
try:
|
||||
|
||||
+17
-2
@@ -17,8 +17,9 @@ import h2g2.engine.verbs # noqa: F401
|
||||
# Import content modules
|
||||
from h2g2.content import globals_content, earth, vogon, heart, unearth, dark
|
||||
|
||||
# Default voice model location (project root)
|
||||
# Default model locations (project root)
|
||||
_DEFAULT_VOICE = Path(__file__).resolve().parent.parent / "en_GB-alan-medium.onnx"
|
||||
_DEFAULT_STT_MODEL = Path(__file__).resolve().parent.parent / "vosk-model-small-en-us-0.15"
|
||||
|
||||
|
||||
def main() -> None:
|
||||
@@ -31,6 +32,14 @@ def main() -> None:
|
||||
"--voice", type=Path, default=_DEFAULT_VOICE,
|
||||
help="Path to Piper .onnx voice model",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--stt", action="store_true",
|
||||
help="Enable speech-to-text input (requires microphone)",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--stt-model", type=Path, default=_DEFAULT_STT_MODEL,
|
||||
help="Path to Vosk model directory",
|
||||
)
|
||||
args = ap.parse_args()
|
||||
|
||||
# Build the world
|
||||
@@ -109,6 +118,12 @@ def main() -> None:
|
||||
from h2g2.engine.tts import TTS
|
||||
tts = TTS(args.voice)
|
||||
|
||||
# Initialize STT if requested
|
||||
stt = None
|
||||
if args.stt:
|
||||
from h2g2.engine.stt import STT
|
||||
stt = STT(args.stt_model)
|
||||
|
||||
banner = output.flush()
|
||||
print(banner, end="")
|
||||
if tts:
|
||||
@@ -116,7 +131,7 @@ def main() -> None:
|
||||
|
||||
# Run the game loop
|
||||
parser = Parser()
|
||||
loop = GameLoop(state, parser, tts=tts)
|
||||
loop = GameLoop(state, parser, tts=tts, stt=stt)
|
||||
state.flags["_game_loop"] = loop
|
||||
loop.run()
|
||||
|
||||
|
||||
@@ -6,6 +6,8 @@ readme = "README.md"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"piper-tts>=1.2.0",
|
||||
"vosk>=0.3.45",
|
||||
"pyaudio>=0.2.14",
|
||||
]
|
||||
|
||||
[build-system]
|
||||
|
||||
Reference in New Issue
Block a user