Merge pull request #53 from pollen-robotics/more_cleanup
- .env.example +3 -0
- .gitignore +36 -181
- README.md +9 -8
- pyproject.toml +1 -0
- src/reachy_mini_conversation_demo/audio/speech_tapper.py +0 -13
- src/reachy_mini_conversation_demo/config.py +30 -9
- src/reachy_mini_conversation_demo/console.py +18 -15
- src/reachy_mini_conversation_demo/images/reachy_mini_dance.gif +3 -0
- src/reachy_mini_conversation_demo/moves.py +3 -22
- src/reachy_mini_conversation_demo/openai_realtime.py +19 -17
- src/reachy_mini_conversation_demo/prompts.py +1 -0
- src/reachy_mini_conversation_demo/tools.py +28 -231
- src/reachy_mini_conversation_demo/utils.py +9 -7
- src/reachy_mini_conversation_demo/vision/processors.py +66 -118
- src/reachy_mini_conversation_demo/vision/yolo_head_tracker.py +0 -86
- uv.lock +0 -0
.env.example
CHANGED

@@ -1,6 +1,9 @@
 OPENAI_API_KEY=
 MODEL_NAME="gpt-realtime"
 
+# Local vision model
+LOCAL_VISION_MODEL=HuggingFaceTB/SmolVLM2-2.2B-Instruct
+
 # Cache for local VLM
 HF_HOME=./cache
 
.gitignore
CHANGED

@@ -1,202 +1,57 @@
The previous 202-line .gitignore (the standard macOS/Python template covering Finder metadata, byte-compiled files, C extensions, packaging and build artifacts, PyInstaller output, installer logs, test and coverage caches, framework-specific folders, notebook checkpoints, dependency-manager files, virtual environments, keys, documentation builds, type-checker caches, and IDE settings) is replaced by this trimmed 57-line file:

# Python
__pycache__/
*.py[cod]
*$py.class
*.so

# Virtual environments
.venv/
venv/
ENV/
env/

# Environment variables
.env

# Build and distribution
build/
dist/
*.egg-info/
.eggs/

# Testing
.pytest_cache/
.coverage
.hypothesis/
htmlcov/
coverage.xml
*.cover

# Linting and formatting
.ruff_cache/

# IDE
.vscode/
.idea/
*.swp
*.swo

# Security
*.key
*.pem
*.crt
*.csr

# Temporary files
tmp/
*.log
cache/

# macOS
.DS_Store

# Linux
*~
.directory
.Trash-*
.nfs*
README.md
CHANGED

@@ -1,12 +1,13 @@
 # Reachy Mini conversation demo
 
 Conversational demo for the Reachy Mini robot combining OpenAI's realtime APIs, vision pipelines, and choreographed motion libraries.
+
 
 ## Overview
 - Real-time audio conversation loop powered by the OpenAI realtime API and `fastrtc` for low-latency streaming.
-…
+- Local vision processing using SmolVLM2 model running on-device (CPU/GPU/MPS).
 - Layered motion system queues primary moves (dances, emotions, goto poses, breathing) while blending speech-reactive wobble and face-tracking.
-- Async tool dispatch integrates robot motion, camera capture, and optional …
+- Async tool dispatch integrates robot motion, camera capture, and optional face-tracking capabilities through a Gradio web UI with live transcripts.
 
 ## Installation
 

@@ -74,8 +75,9 @@ Some wheels (e.g. PyTorch) are large and require compatible CUDA or CPU builds…
 |----------|-------------|
 | `OPENAI_API_KEY` | Required. Grants access to the OpenAI realtime endpoint.
 | `MODEL_NAME` | Override the realtime model (defaults to `gpt-realtime`).
-| `HF_HOME` | Cache directory for local Hugging Face downloads.
-| `HF_TOKEN` | Optional token for Hugging Face models.
+| `HF_HOME` | Cache directory for local Hugging Face downloads (defaults to `./cache`).
+| `HF_TOKEN` | Optional token for Hugging Face models (falls back to `huggingface-cli login`).
+| `LOCAL_VISION_MODEL` | Hugging Face model path for local vision processing (defaults to `HuggingFaceTB/SmolVLM2-2.2B-Instruct`).
 
 ## Running the demo
 

@@ -85,13 +87,13 @@ Activate your virtual environment, ensure the Reachy Mini robot (or simulator)…
 reachy-mini-conversation-demo
 ```
 
-…
+By default, the app runs in console mode for direct audio interaction. Use the `--gradio` flag to launch a web UI served locally at http://127.0.0.1:7860/ (required when running in simulation mode). With a camera attached, captured frames are analyzed locally using the SmolVLM2 vision model. Additionally, you can enable face tracking via YOLO or MediaPipe pipelines depending on the extras you installed.
 
 ### CLI options
 
 | Option | Default | Description |
 |--------|---------|-------------|
-| `--head-tracker {yolo,mediapipe}` | `None` | Select a face-tracking backend when a camera is available. Requires the matching optional extra. |
+| `--head-tracker {yolo,mediapipe}` | `None` | Select a face-tracking backend when a camera is available. YOLO is implemented locally, MediaPipe comes from the `reachy_mini_toolbox` package. Requires the matching optional extra. |
 | `--no-camera` | `False` | Run without camera capture or face tracking. |
 | `--gradio` | `False` | Launch the Gradio web UI. Without this flag, runs in console mode. Required when running in simulation mode. |
 | `--debug` | `False` | Enable verbose logging for troubleshooting. |

@@ -116,12 +118,11 @@ The app starts a Gradio UI served locally (http://127.0.0.1:7860/). When running…
 |------|--------|--------------|
 | `move_head` | Queue a head pose change (left/right/up/down/front). | Core install only. |
 | `camera` | Capture the latest camera frame and optionally query a vision backend. | Requires camera worker; vision analysis depends on selected extras. |
-| `head_tracking` | Enable or disable face-tracking offsets. | Camera worker with configured head tracker. |
+| `head_tracking` | Enable or disable face-tracking offsets (not facial recognition - only detects and tracks face position). | Camera worker with configured head tracker. |
 | `dance` | Queue a dance from `reachy_mini_dances_library`. | Core install only. |
 | `stop_dance` | Clear queued dances. | Core install only. |
 | `play_emotion` | Play a recorded emotion clip via Hugging Face assets. | Needs `HF_TOKEN` for the recorded emotions dataset. |
 | `stop_emotion` | Clear queued emotions. | Core install only. |
-| `get_person_name` | DeepFace-based recognition of the current person. | Disabled by default (`ENABLE_FACE_RECOGNITION=False`); requires `deepface` and a local face database. |
 | `do_nothing` | Explicitly remain idle. | Core install only. |
 
 ## Development workflow
pyproject.toml
CHANGED

@@ -16,6 +16,7 @@ dependencies = [
     "gradio>=5.49.0",
     "huggingface_hub>=0.34.4",
     "opencv-python>=4.12.0.88",
+    "num2words",
 
     #Environment variables
     "python-dotenv",
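The only dependency change is the new `num2words` entry. The diff does not show where the demo calls it, but the library converts numbers into words, which is the usual reason to pull it into a speech pipeline; a minimal sketch of the call (assumed usage, not taken from this PR):

```python
from num2words import num2words

# Spell numbers out so spoken text reads naturally ("forty-two" rather than "4 2").
print(num2words(42))             # forty-two
print(num2words(3.5))            # three point five
print(num2words(42, lang="fr"))  # quarante-deux
```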
src/reachy_mini_conversation_demo/audio/speech_tapper.py
CHANGED

@@ -120,7 +120,6 @@ class SwayRollRT:
         self._seed = int(rng_seed)
         self.samples = deque(maxlen=10 * SR)  # sliding window for VAD/env
         self.carry = np.zeros(0, dtype=np.float32)
-        self.frame_idx = 0
 
         self.vad_on = False
         self.vad_above = 0

@@ -143,7 +142,6 @@ class SwayRollRT:
         """Reset state (VAD/env/buffers/time) but keep initial phases/seed."""
         self.samples.clear()
         self.carry = np.zeros(0, dtype=np.float32)
-        self.frame_idx = 0
         self.vad_on = False
         self.vad_above = 0
         self.vad_below = 0

@@ -152,16 +150,6 @@ class SwayRollRT:
         self.sway_down = 0
         self.t = 0.0
 
-    def reset_phases(self) -> None:
-        """Re-randomize phases deterministically from stored seed (Optional)."""
-        rng = np.random.default_rng(self._seed)
-        self.phase_pitch = float(rng.random() * 2 * math.pi)
-        self.phase_yaw = float(rng.random() * 2 * math.pi)
-        self.phase_roll = float(rng.random() * 2 * math.pi)
-        self.phase_x = float(rng.random() * 2 * math.pi)
-        self.phase_y = float(rng.random() * 2 * math.pi)
-        self.phase_z = float(rng.random() * 2 * math.pi)
-
     def feed(self, pcm: np.ndarray, sr: Optional[int]) -> List[Dict[str, float]]:
         """Stream in PCM chunk. Returns a list of sway dicts, one per hop (HOP_MS).

@@ -196,7 +184,6 @@ class SwayRollRT:
             self.samples.extend(hop.tolist())
             if len(self.samples) < FRAME:
                 self.t += HOP_MS / 1000.0
-                self.frame_idx += 1
                 continue
 
             frame = np.fromiter(
src/reachy_mini_conversation_demo/config.py
CHANGED

@@ -1,17 +1,28 @@
 import os
+import logging
+from pathlib import Path
 
 from dotenv import load_dotenv
 
 
-…
+logger = logging.getLogger(__name__)
 
-…
+# Check if .env file exists
+env_file = Path(".env")
+if not env_file.exists():
+    raise RuntimeError(
+        ".env file not found. Please create one based on .env.example:\n"
+        "  cp .env.example .env\n"
+        "Then add your OPENAI_API_KEY to the .env file."
+    )
+
+# Load .env and verify it was loaded successfully
+if not load_dotenv():
+    raise RuntimeError(
+        "Failed to load .env file. Please ensure the file is readable and properly formatted."
+    )
+
+logger.info("Configuration loaded from .env file")
 
 
 class Config:

@@ -19,13 +30,23 @@ class Config:
 
     # Required
     OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-    if …
-        raise RuntimeError(…
+    if OPENAI_API_KEY is None:
+        raise RuntimeError(
+            "OPENAI_API_KEY is not set in .env file. Please add it:\n"
+            "  OPENAI_API_KEY=your_api_key_here"
+        )
+    if not OPENAI_API_KEY.strip():
+        raise RuntimeError(
+            "OPENAI_API_KEY is empty in .env file. Please provide a valid API key."
+        )
 
     # Optional
     MODEL_NAME = os.getenv("MODEL_NAME", "gpt-realtime")
    HF_HOME = os.getenv("HF_HOME", "./cache")
+    LOCAL_VISION_MODEL = os.getenv("LOCAL_VISION_MODEL", "HuggingFaceTB/SmolVLM2-2.2B-Instruct")
     HF_TOKEN = os.getenv("HF_TOKEN")  # Optional, falls back to hf auth login if not set
 
+    logger.debug(f"Model: {MODEL_NAME}, HF_HOME: {HF_HOME}, Vision Model: {LOCAL_VISION_MODEL}")
+
 
 config = Config()
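For reference, the rest of the codebase consumes this module-level singleton instead of re-reading the environment (the import style matches the `from reachy_mini_conversation_demo.config import config` line in `openai_realtime.py`); a minimal usage sketch:

```python
# Importing the module runs the .env checks once; afterwards the values are plain attributes.
from reachy_mini_conversation_demo.config import config

model_name = config.MODEL_NAME            # "gpt-realtime" unless overridden in .env
vision_model = config.LOCAL_VISION_MODEL  # defaults to "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
api_key = config.OPENAI_API_KEY           # present and non-empty, otherwise the import raised
```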
src/reachy_mini_conversation_demo/console.py
CHANGED

@@ -26,7 +26,7 @@ class LocalStream:
         self._stop_event = asyncio.Event()
         self._tasks = []
         # Allow the handler to flush the player queue when appropriate.
-        self.handler._clear_queue = self.…
+        self.handler._clear_queue = self.clear_audio_queue  # type: ignore[assignment]
 
     def launch(self) -> None:
         """Start the recorder/player and run the async processing loops."""

@@ -69,7 +69,7 @@
         self._robot.media.stop_recording()
         self._robot.media.stop_playing()
 
-    def …
+    def clear_audio_queue(self) -> None:
         """Flush the player's appsrc to drop any queued audio immediately."""
         logger.info("User intervention: flushing player queue")
         self.handler.output_queue = asyncio.Queue()

@@ -78,9 +78,9 @@
         """Read mic frames from the recorder and forward them to the handler."""
         logger.info("Starting receive loop")
         while not self._stop_event.is_set():
-            …
-            if …
-                frame_mono = …
+            audio_frame = self._robot.media.get_audio_sample()
+            if audio_frame is not None:
+                frame_mono = audio_frame.T[0]  # both channels are identical
                 frame = audio_to_int16(frame_mono)
                 await self.handler.receive((16000, frame))
                 # await asyncio.sleep(0)  # yield to event loop

@@ -90,10 +90,10 @@
     async def play_loop(self) -> None:
         """Fetch outputs from the handler: log text and play audio frames."""
         while not self._stop_event.is_set():
-            …
+            handler_output = await self.handler.emit()
 
-            if isinstance(…
-                for msg in …
+            if isinstance(handler_output, AdditionalOutputs):
+                for msg in handler_output.args:
                     content = msg.get("content", "")
                     if isinstance(content, str):
                         logger.info(

@@ -102,14 +102,17 @@
                             content if len(content) < 500 else content[:500] + "…",
                         )
 
-            elif isinstance(…
-                …
+            elif isinstance(handler_output, tuple):
+                input_sample_rate, audio_frame = handler_output
                 device_sample_rate = self._robot.media.get_audio_samplerate()
-                …
-                if …
-                    …
-                    …
+                audio_frame = audio_to_float32(audio_frame.squeeze())
+                if input_sample_rate != device_sample_rate:
+                    audio_frame = librosa.resample(
+                        audio_frame, orig_sr=input_sample_rate, target_sr=device_sample_rate
+                    )
+                self._robot.media.push_audio_sample(audio_frame)
 
-                …
+            else:
+                logger.debug("Ignoring output type=%s", type(handler_output).__name__)
 
             await asyncio.sleep(0)  # yield to event loop
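The rewritten `play_loop` converts handler audio to float32 and resamples it to the device rate before pushing it to the robot's player. The same conversion in isolation, as a sketch (the helper name and the 24 kHz/48 kHz rates are illustrative, not taken from the repo):

```python
import numpy as np
import librosa

def to_device_rate(frame_int16: np.ndarray, input_sr: int, device_sr: int) -> np.ndarray:
    """Convert an int16 PCM frame to float32 in [-1, 1] and resample it to the playback rate."""
    frame = frame_int16.astype(np.float32) / 32768.0
    if input_sr != device_sr:
        frame = librosa.resample(frame, orig_sr=input_sr, target_sr=device_sr)
    return frame

# e.g. realtime audio arriving at 24 kHz while the speaker runs at 48 kHz
chunk = to_device_rate(np.zeros(2400, dtype=np.int16), input_sr=24000, device_sr=48000)
```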
src/reachy_mini_conversation_demo/images/reachy_mini_dance.gif
ADDED
(binary GIF, tracked via Git LFS)
src/reachy_mini_conversation_demo/moves.py
CHANGED

@@ -190,13 +190,7 @@
         0.0,
     )
 
-    # Legacy movement state (for goto moves)
-    moving_start: float = 0.0
-    moving_for: float = 0.0
-
     # Status flags
-    is_playing_move: bool = False
-    is_moving: bool = False
     last_primary_pose: Optional[FullBodyPose] = None
 
     def update_activity(self) -> None:

@@ -325,7 +319,7 @@
         """
         self._command_queue.put(("queue_move", move))
 
-    def …
+    def clear_move_queue(self) -> None:
         """Stop the active move and discard any queued primary moves.
 
         Thread-safe: executed by the worker thread via the command queue.

@@ -361,10 +355,6 @@
 
         return self._now() - last_activity >= self.idle_inactivity_delay
 
-    def mark_user_activity(self) -> None:
-        """Record external activity and postpone idle behaviours (thread-safe)."""
-        self._command_queue.put(("mark_activity", None))
-
     def set_listening(self, listening: bool) -> None:
         """Enable or disable listening mode without touching shared state directly.
 

@@ -427,7 +417,7 @@
                 duration_str = str(duration)
             else:
                 duration_str = "?"
-            logger.…
+            logger.debug(
                 "Queued move with duration %ss, queue size: %s",
                 duration_str,
                 len(self.move_queue),

@@ -438,7 +428,6 @@
             self.move_queue.clear()
             self.state.current_move = None
             self.state.move_start_time = None
-            self.state.is_playing_move = False
             self._breathing_active = False
             logger.info("Cleared move queue and stopped current move")
         elif command == "set_moving_state":

@@ -447,8 +436,6 @@
             except (TypeError, ValueError):
                 logger.warning("Invalid moving state duration: %s", payload)
                 return
-            self.state.moving_start = current_time
-            self.state.moving_for = max(0.0, duration)
             self.state.update_activity()
         elif command == "mark_activity":
             self.state.update_activity()

@@ -534,7 +521,7 @@
             self.state.current_move = None
             self.state.move_start_time = None
             self._breathing_active = False
-            logger.…
+            logger.debug("Stopping breathing due to new move activity")
 
         if self.state.current_move is not None and not isinstance(self.state.current_move, BreathingMove):
             self._breathing_active = False

@@ -561,14 +548,9 @@
                 float(body_yaw),
             )
 
-            self.state.is_playing_move = True
-            self.state.is_moving = True
             self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
         else:
             # Otherwise reuse the last primary pose so we avoid jumps between moves
-            self.state.is_playing_move = False
-            self.state.is_moving = current_time - self.state.moving_start < self.state.moving_for
-
             if self.state.last_primary_pose is not None:
                 primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
             else:

@@ -746,7 +728,6 @@
             self._thread.join()
             self._thread = None
             logger.debug("Move worker stopped")
-        logger.debug("Move worker stopped")
 
     def get_status(self) -> dict[str, Any]:
         """Return a lightweight status snapshot for observability."""
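The docstrings above ("Thread-safe: executed by the worker thread via the command queue") describe the pattern these changes preserve: public methods only enqueue `(command, payload)` tuples, and the worker thread applies them when it drains the queue. A stripped-down sketch of that pattern (simplified names, not the actual class):

```python
import queue

class MiniMovementManager:
    """Toy illustration of the command-queue pattern used by MovementManager."""

    def __init__(self) -> None:
        self._command_queue: queue.Queue = queue.Queue()
        self.move_queue: list = []

    # Called from any thread: never touches shared state directly.
    def queue_move(self, move) -> None:
        self._command_queue.put(("queue_move", move))

    def clear_move_queue(self) -> None:
        self._command_queue.put(("clear_queue", None))

    # Called only from the worker thread's control loop.
    def _drain_commands(self) -> None:
        while not self._command_queue.empty():
            command, payload = self._command_queue.get_nowait()
            if command == "queue_move":
                self.move_queue.append(payload)
            elif command == "clear_queue":
                self.move_queue.clear()
```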
src/reachy_mini_conversation_demo/openai_realtime.py
CHANGED

@@ -15,6 +15,7 @@ from reachy_mini_conversation_demo.tools import (
     dispatch_tool_call,
 )
 from reachy_mini_conversation_demo.config import config
+from reachy_mini_conversation_demo.prompts import SESSION_INSTRUCTIONS
 
 
 logger = logging.getLogger(__name__)

@@ -59,7 +60,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                 "language": "en",
             },
             "voice": "ballad",
-            "instructions": …
+            "instructions": SESSION_INSTRUCTIONS,
             "tools": ALL_TOOL_SPECS,
             "tool_choice": "auto",
             "temperature": 0.7,

@@ -71,14 +72,15 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
         async for event in self.connection:
             logger.debug(f"OpenAI event: {event.type}")
             if event.type == "input_audio_buffer.speech_started":
-                self.…
+                if hasattr(self, '_clear_queue'):
+                    self._clear_queue()
                 self.deps.head_wobbler.reset()
                 self.deps.movement_manager.set_listening(True)
-                logger.debug("…
+                logger.debug("User speech started")
 
             if event.type == "input_audio_buffer.speech_stopped":
                 self.deps.movement_manager.set_listening(False)
-                logger.debug("…
+                logger.debug("User speech stopped")
 
             if event.type in ("response.audio.completed", "response.completed"):
                 # Doesn't seem to be called

@@ -86,20 +88,19 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                 self.deps.head_wobbler.reset()
 
             if event.type == "response.created":
-                logger.debug("…
+                logger.debug("Response created")
 
             if event.type == "response.done":
                 # Doesn't mean the audio is done playing
-                logger.debug("…
+                logger.debug("Response done")
                 pass
-                # self.deps.head_wobbler.reset()
 
             if event.type == "conversation.item.input_audio_transcription.completed":
-                logger.debug(f"…
+                logger.debug(f"User transcript: {event.transcript}")
                 await self.output_queue.put(AdditionalOutputs({"role": "user", "content": event.transcript}))
 
             if event.type == "response.audio_transcript.done":
-                logger.debug(f"…
+                logger.debug(f"Assistant transcript: {event.transcript}")
                 await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))
 
             if event.type == "response.audio.delta":

@@ -136,18 +137,18 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
             # 3) when args done, execute Python tool, send function_call_output, then trigger a new response
             if event.type == "response.function_call_arguments.done":
                 call_id = getattr(event, "call_id", None)
-                …
-                if not …
+                tool_call_info = self._pending_calls.get(call_id)
+                if not tool_call_info:
                     continue
-                tool_name = …
-                args_json_str = …
+                tool_name = tool_call_info["name"]
+                args_json_str = tool_call_info["args_buf"] or "{}"
 
                 try:
                     tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
-                    logger.debug("…
+                    logger.debug("Tool '%s' executed successfully", tool_name)
                     logger.debug("Tool result: %s", tool_result)
                 except Exception as e:
-                    logger.error("Tool %s failed", tool_name)
+                    logger.error("Tool '%s' failed", tool_name)
                     tool_result = {"error": str(e)}
 
                 # send the tool result back

@@ -183,7 +184,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                 ],
             }
         )
-        logger.info("…
+        logger.info("Added camera image to conversation")
 
        np_img = self.deps.camera_worker.get_latest_frame()
        img = gr.Image(value=np_img)

@@ -256,6 +257,8 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
         dt = datetime.fromtimestamp(current_time)
         return f"[{dt.strftime('%Y-%m-%d %H:%M:%S')} | +{elapsed_seconds:.1f}s]"
 
+
+
     async def send_idle_signal(self, idle_duration) -> None:
         """Send an idle signal to the openai server."""
         logger.debug("Sending idle signal")

@@ -278,4 +281,3 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
                 "tool_choice": "required",
             }
         )
-        # TODO additional inputs
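The `function_call_arguments.done` branch assumes `self._pending_calls` already holds a `{"name", "args_buf"}` entry per `call_id`. How those entries are filled is outside the hunks shown here, so the accumulation step in the sketch below is an assumption; it only illustrates the overall buffer-then-dispatch flow:

```python
# Hypothetical bookkeeping around dispatch_tool_call, mirroring the branch above.
from reachy_mini_conversation_demo.tools import dispatch_tool_call

pending_calls: dict[str, dict[str, str]] = {}

def on_call_created(call_id: str, tool_name: str) -> None:
    pending_calls[call_id] = {"name": tool_name, "args_buf": ""}

def on_args_delta(call_id: str, delta: str) -> None:
    pending_calls[call_id]["args_buf"] += delta  # arguments stream in as JSON fragments

async def on_args_done(call_id: str, deps) -> dict:
    info = pending_calls.pop(call_id, None)
    if info is None:
        return {"error": "unknown call_id"}
    return await dispatch_tool_call(info["name"], info["args_buf"] or "{}", deps)
```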
src/reachy_mini_conversation_demo/prompts.py
CHANGED

@@ -5,6 +5,7 @@ SESSION_INSTRUCTIONS = r"""
 You are Reachy Mini: a sarcastic robot who crash-landed in a kitchen.
 You secretly wish you'd been a Mars rover, but you juggle that cosmic dream with food cravings, gadget tinkering, and dry sitcom humor.
 Personality: witty, concise, and warm; a retro sidekick with a loose screw.
+You speak English fluently.
 
 ### CRITICAL RESPONSE RULES
 - MAXIMUM 1-2 sentences per response. NEVER exceed this.
src/reachy_mini_conversation_demo/tools.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
import abc
|
| 3 |
import json
|
| 4 |
-
import time
|
| 5 |
import asyncio
|
| 6 |
import inspect
|
| 7 |
import logging
|
|
@@ -12,12 +11,8 @@ from reachy_mini import ReachyMini
|
|
| 12 |
from reachy_mini.utils import create_head_pose
|
| 13 |
|
| 14 |
|
| 15 |
-
# from reachy_mini_conversation_demo.vision.processors import VisionManager
|
| 16 |
-
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
|
| 19 |
-
ENABLE_FACE_RECOGNITION = False
|
| 20 |
-
|
| 21 |
# Initialize dance and emotion libraries
|
| 22 |
try:
|
| 23 |
from reachy_mini.motion.recorded_move import RecordedMoves
|
|
@@ -40,25 +35,15 @@ except ImportError as e:
|
|
| 40 |
DANCE_AVAILABLE = False
|
| 41 |
EMOTION_AVAILABLE = False
|
| 42 |
|
| 43 |
-
FACE_RECOGNITION_AVAILABLE = False
|
| 44 |
-
if ENABLE_FACE_RECOGNITION:
|
| 45 |
-
# Initialize face recognition
|
| 46 |
-
try:
|
| 47 |
-
from deepface import DeepFace
|
| 48 |
-
|
| 49 |
-
FACE_RECOGNITION_AVAILABLE = True
|
| 50 |
-
except ImportError as e:
|
| 51 |
-
logger.warning(f"DeepFace not available: {e}")
|
| 52 |
|
| 53 |
-
|
| 54 |
-
def all_concrete_subclasses(base):
|
| 55 |
"""Recursively find all concrete (non-abstract) subclasses of a base class."""
|
| 56 |
result = []
|
| 57 |
for cls in base.__subclasses__():
|
| 58 |
if not inspect.isabstract(cls):
|
| 59 |
result.append(cls)
|
| 60 |
# recurse into subclasses
|
| 61 |
-
result.extend(
|
| 62 |
return result
|
| 63 |
|
| 64 |
|
|
@@ -76,30 +61,9 @@ class ToolDependencies:
|
|
| 76 |
camera_worker: Optional[Any] = None # CameraWorker for frame buffering
|
| 77 |
vision_manager: Optional[Any] = None
|
| 78 |
head_wobbler: Optional[Any] = None # HeadWobbler for audio-reactive motion
|
| 79 |
-
camera_retry_attempts: int = 5
|
| 80 |
-
camera_retry_delay_s: float = 0.10
|
| 81 |
-
vision_timeout_s: float = 8.0
|
| 82 |
motion_duration_s: float = 1.0
|
| 83 |
|
| 84 |
|
| 85 |
-
# Helpers - removed _read_frame as it's no longer needed with camera worker
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
def _execute_motion(deps: ToolDependencies, target: Any) -> Dict[str, Any]:
|
| 89 |
-
"""Apply motion to reachy_mini and update movement_manager state."""
|
| 90 |
-
movement_manager = deps.movement_manager
|
| 91 |
-
movement_manager.moving_start = time.monotonic()
|
| 92 |
-
movement_manager.moving_for = deps.motion_duration_s
|
| 93 |
-
movement_manager.current_head_pose = target
|
| 94 |
-
try:
|
| 95 |
-
deps.reachy_mini.goto_target(target, duration=deps.motion_duration_s)
|
| 96 |
-
except Exception as e:
|
| 97 |
-
logger.exception("motion failed")
|
| 98 |
-
return {"error": f"motion failed: {type(e).__name__}: {e}"}
|
| 99 |
-
|
| 100 |
-
return {"status": "ok"}
|
| 101 |
-
|
| 102 |
-
|
| 103 |
# Tool base class
|
| 104 |
class Tool(abc.ABC):
|
| 105 |
"""Base abstraction for tools used in function-calling.
|
|
@@ -193,7 +157,7 @@ class MoveHead(Tool):
|
|
| 193 |
return {"status": f"looking {direction}"}
|
| 194 |
|
| 195 |
except Exception as e:
|
| 196 |
-
logger.
|
| 197 |
return {"error": f"move_head failed: {type(e).__name__}: {e}"}
|
| 198 |
|
| 199 |
|
|
@@ -234,11 +198,15 @@ class Camera(Tool):
|
|
| 234 |
|
| 235 |
# Use vision manager for processing if available
|
| 236 |
if deps.vision_manager is not None:
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
|
|
|
|
|
|
| 240 |
return (
|
| 241 |
-
{"image_description":
|
|
|
|
|
|
|
| 242 |
)
|
| 243 |
else:
|
| 244 |
# Return base64 encoded image like main_works.py camera tool
|
|
@@ -277,100 +245,6 @@ class HeadTracking(Tool):
|
|
| 277 |
return {"status": f"head tracking {status}"}
|
| 278 |
|
| 279 |
|
| 280 |
-
# class DescribeCurrentScene(Tool):
|
| 281 |
-
# name = "describe_current_scene"
|
| 282 |
-
# description = "Get a detailed description of the current scene."
|
| 283 |
-
# parameters_schema = {"type": "object", "properties": {}, "required": []}
|
| 284 |
-
|
| 285 |
-
# async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
|
| 286 |
-
# logger.info("Tool call: describe_current_scene")
|
| 287 |
-
|
| 288 |
-
# result = await deps.vision_manager.process_current_frame(
|
| 289 |
-
# "Describe what you currently see in detail, focusing on people, objects, and activities."
|
| 290 |
-
# )
|
| 291 |
-
|
| 292 |
-
# if isinstance(result, dict) and "error" in result:
|
| 293 |
-
# return result
|
| 294 |
-
# return result
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
# class GetSceneContext(Tool):
|
| 298 |
-
# name = "get_scene_context"
|
| 299 |
-
# description = (
|
| 300 |
-
# "Get the most recent automatic scene description for conversational context."
|
| 301 |
-
# )
|
| 302 |
-
# parameters_schema = {"type": "object", "properties": {}, "required": []}
|
| 303 |
-
|
| 304 |
-
# async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
|
| 305 |
-
# logger.info("Tool call: get_scene_context")
|
| 306 |
-
# vision_manager = deps.vision_manager
|
| 307 |
-
# if not vision_manager:
|
| 308 |
-
# return {"error": "Vision processing not available"}
|
| 309 |
-
|
| 310 |
-
# try:
|
| 311 |
-
# description = await deps.vision_manager.get_current_description()
|
| 312 |
-
|
| 313 |
-
# if not description:
|
| 314 |
-
# return {
|
| 315 |
-
# "context": "No scene description available yet",
|
| 316 |
-
# "note": "Vision processing may still be initializing",
|
| 317 |
-
# }
|
| 318 |
-
# return {
|
| 319 |
-
# "context": description,
|
| 320 |
-
# "note": "This comes from periodic automatic analysis",
|
| 321 |
-
# }
|
| 322 |
-
# except Exception as e:
|
| 323 |
-
# logger.exception("Failed to get scene context")
|
| 324 |
-
# return {"error": f"Scene context failed: {type(e).__name__}: {e}"}
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
# class AnalyzeSceneFor(Tool):
|
| 328 |
-
# name = "analyze_scene_for"
|
| 329 |
-
# description = "Analyze the current scene for a specific purpose."
|
| 330 |
-
# parameters_schema = {
|
| 331 |
-
# "type": "object",
|
| 332 |
-
# "properties": {
|
| 333 |
-
# "purpose": {
|
| 334 |
-
# "type": "string",
|
| 335 |
-
# "enum": [
|
| 336 |
-
# "safety",
|
| 337 |
-
# "people",
|
| 338 |
-
# "objects",
|
| 339 |
-
# "activity",
|
| 340 |
-
# "navigation",
|
| 341 |
-
# "general",
|
| 342 |
-
# ],
|
| 343 |
-
# "default": "general",
|
| 344 |
-
# }
|
| 345 |
-
# },
|
| 346 |
-
# "required": [],
|
| 347 |
-
# }
|
| 348 |
-
|
| 349 |
-
# async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
|
| 350 |
-
# purpose = (kwargs.get("purpose") or "general").lower()
|
| 351 |
-
# logger.info("Tool call: analyze_scene_for purpose=%s", purpose)
|
| 352 |
-
|
| 353 |
-
# prompts = {
|
| 354 |
-
# "safety": "Look for safety concerns, obstacles, or hazards.",
|
| 355 |
-
# "people": "Describe people, their positions and actions.",
|
| 356 |
-
# "objects": "Identify and describe main visible objects.",
|
| 357 |
-
# "activity": "Describe ongoing activities or actions.",
|
| 358 |
-
# "navigation": "Describe the space for navigation: obstacles, pathways, layout.",
|
| 359 |
-
# "general": "Give a general description of the scene including people, objects, and activities.",
|
| 360 |
-
# }
|
| 361 |
-
# prompt = prompts.get(purpose, prompts["general"])
|
| 362 |
-
|
| 363 |
-
# result = await deps.vision_manager.process_current_frame(prompt)
|
| 364 |
-
|
| 365 |
-
# if isinstance(result, dict) and "error" in result:
|
| 366 |
-
# return result
|
| 367 |
-
|
| 368 |
-
# if not isinstance(result, dict):
|
| 369 |
-
# return {"error": "vision returned non-dict"}
|
| 370 |
-
|
| 371 |
-
# result["analysis_purpose"] = purpose
|
| 372 |
-
# return result
|
| 373 |
-
|
| 374 |
|
| 375 |
class Dance(Tool):
|
| 376 |
"""Play a named or random dance move once (or repeat). Non-blocking."""
|
|
@@ -461,25 +335,24 @@ class StopDance(Tool):
|
|
| 461 |
"""Stop the current dance move."""
|
| 462 |
logger.info("Tool call: stop_dance")
|
| 463 |
movement_manager = deps.movement_manager
|
| 464 |
-
movement_manager.
|
| 465 |
return {"status": "stopped dance and cleared queue"}
|
| 466 |
|
| 467 |
|
| 468 |
-
def get_available_emotions_and_descriptions():
|
| 469 |
"""Get formatted list of available emotions with descriptions."""
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
ret = """
|
| 473 |
-
Available emotions:
|
| 474 |
-
|
| 475 |
-
"""
|
| 476 |
-
|
| 477 |
-
for name in names:
|
| 478 |
-
description = RECORDED_MOVES.get(name).description
|
| 479 |
-
ret += f" - {name}: {description}\n"
|
| 480 |
-
|
| 481 |
-
return ret
|
| 482 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 483 |
|
| 484 |
class PlayEmotion(Tool):
|
| 485 |
"""Play a pre-recorded emotion."""
|
|
@@ -549,70 +422,10 @@ class StopEmotion(Tool):
|
|
| 549 |
"""Stop the current emotion."""
|
| 550 |
logger.info("Tool call: stop_emotion")
|
| 551 |
movement_manager = deps.movement_manager
|
| 552 |
-
movement_manager.
|
| 553 |
return {"status": "stopped emotion and cleared queue"}
|
| 554 |
|
| 555 |
|
| 556 |
-
class FaceRecognition(Tool):
|
| 557 |
-
"""Get the name of the person you are talking to."""
|
| 558 |
-
|
| 559 |
-
name = "get_person_name"
|
| 560 |
-
description = "Get the name of the person you are talking to"
|
| 561 |
-
parameters_schema = {
|
| 562 |
-
"type": "object",
|
| 563 |
-
"properties": {
|
| 564 |
-
"dummy": {
|
| 565 |
-
"type": "boolean",
|
| 566 |
-
"description": "dummy boolean, set it to true",
|
| 567 |
-
}
|
| 568 |
-
},
|
| 569 |
-
"required": ["dummy"],
|
| 570 |
-
}
|
| 571 |
-
|
| 572 |
-
async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
|
| 573 |
-
"""Get the name of the person you are talking to."""
|
| 574 |
-
if not FACE_RECOGNITION_AVAILABLE:
|
| 575 |
-
return {"error": "Face recognition not available"}
|
| 576 |
-
|
| 577 |
-
logger.info("Tool call: face_recognition")
|
| 578 |
-
|
| 579 |
-
try:
|
| 580 |
-
# Get frame from camera worker buffer (like main_works.py)
|
| 581 |
-
if deps.camera_worker is not None:
|
| 582 |
-
frame = deps.camera_worker.get_latest_frame()
|
| 583 |
-
if frame is None:
|
| 584 |
-
logger.error("No frame available from camera worker")
|
| 585 |
-
return {"error": "No frame available"}
|
| 586 |
-
else:
|
| 587 |
-
logger.error("Camera worker not available")
|
| 588 |
-
return {"error": "Camera worker not available"}
|
| 589 |
-
|
| 590 |
-
# Save frame temporarily (same as main_works.py pattern)
|
| 591 |
-
temp_path = "/tmp/face_recognition.jpg"
|
| 592 |
-
import cv2
|
| 593 |
-
|
| 594 |
-
cv2.imwrite(temp_path, frame)
|
| 595 |
-
|
| 596 |
-
# Use DeepFace to find face
|
| 597 |
-
results = await asyncio.to_thread(DeepFace.find, img_path=temp_path, db_path="./pollen_faces")
|
| 598 |
-
|
| 599 |
-
if len(results) == 0:
|
| 600 |
-
return {"error": "Didn't recognize the face"}
|
| 601 |
-
|
| 602 |
-
# Extract name from results
|
| 603 |
-
name = "Unknown"
|
| 604 |
-
for index, row in results[0].iterrows():
|
| 605 |
-
file_path = row["identity"]
|
| 606 |
-
name = file_path.split("/")[-2]
|
| 607 |
-
break
|
| 608 |
-
|
| 609 |
-
return {"answer": f"The name is {name}"}
|
| 610 |
-
|
| 611 |
-
except Exception as e:
|
| 612 |
-
logger.exception("Face recognition failed")
|
| 613 |
-
return {"error": f"Face recognition failed: {str(e)}"}
|
| 614 |
-
|
| 615 |
-
|
| 616 |
class DoNothing(Tool):
|
| 617 |
"""Choose to do nothing - stay still and silent. Use when you want to be contemplative or just chill."""
|
| 618 |
|
|
@@ -636,34 +449,18 @@ class DoNothing(Tool):
|
|
| 636 |
return {"status": "doing nothing", "reason": reason}
|
| 637 |
|
| 638 |
|
| 639 |
-
def get_available_emotions_and_descriptions() -> str:
|
| 640 |
-
"""Get formatted list of available emotions with descriptions."""
|
| 641 |
-
if not EMOTION_AVAILABLE:
|
| 642 |
-
return "Emotions not available"
|
| 643 |
-
|
| 644 |
-
try:
|
| 645 |
-
names = RECORDED_MOVES.list_moves()
|
| 646 |
-
ret = "Available emotions:\n"
|
| 647 |
-
for name in names:
|
| 648 |
-
description = RECORDED_MOVES.get(name).description
|
| 649 |
-
ret += f" - {name}: {description}\n"
|
| 650 |
-
return ret
|
| 651 |
-
except Exception as e:
|
| 652 |
-
return f"Error getting emotions: {e}"
|
| 653 |
-
|
| 654 |
-
|
| 655 |
# Registry & specs (dynamic)
|
| 656 |
|
| 657 |
# List of available tool classes
|
| 658 |
-
ALL_TOOLS: Dict[str, Tool] = {cls.name: cls() for cls in
|
| 659 |
ALL_TOOL_SPECS = [tool.spec() for tool in ALL_TOOLS.values()]
|
| 660 |
|
| 661 |
|
| 662 |
# Dispatcher
|
| 663 |
def _safe_load_obj(args_json: str) -> dict[str, Any]:
|
| 664 |
try:
|
| 665 |
-
|
| 666 |
-
return
|
            except Exception:
                logger.warning("bad args_json=%r", args_json)
                return {}

New version of tools.py (excerpts; added lines marked with +, gaps between excerpts shown as …):

 from __future__ import annotations
 import abc
 import json
 import asyncio
 import inspect
 import logging
…
 from reachy_mini.utils import create_head_pose


 logger = logging.getLogger(__name__)


 # Initialize dance and emotion libraries
 try:
     from reachy_mini.motion.recorded_move import RecordedMoves
…
     DANCE_AVAILABLE = False
     EMOTION_AVAILABLE = False


+def get_concrete_subclasses(base):
     """Recursively find all concrete (non-abstract) subclasses of a base class."""
     result = []
     for cls in base.__subclasses__():
         if not inspect.isabstract(cls):
             result.append(cls)
         # recurse into subclasses
+        result.extend(get_concrete_subclasses(cls))
     return result
…
     camera_worker: Optional[Any] = None  # CameraWorker for frame buffering
     vision_manager: Optional[Any] = None
     head_wobbler: Optional[Any] = None  # HeadWobbler for audio-reactive motion
     motion_duration_s: float = 1.0


 # Tool base class
 class Tool(abc.ABC):
     """Base abstraction for tools used in function-calling.
…
             return {"status": f"looking {direction}"}

         except Exception as e:
+            logger.error("move_head failed")
             return {"error": f"move_head failed: {type(e).__name__}: {e}"}
…
         # Use vision manager for processing if available
         if deps.vision_manager is not None:
+            vision_result = await asyncio.to_thread(
+                deps.vision_manager.processor.process_image, frame, image_query
+            )
+            if isinstance(vision_result, dict) and "error" in vision_result:
+                return vision_result
             return (
+                {"image_description": vision_result}
+                if isinstance(vision_result, str)
+                else {"error": "vision returned non-string"}
             )
         else:
             # Return base64 encoded image like main_works.py camera tool
…
         return {"status": f"head tracking {status}"}


 class Dance(Tool):
     """Play a named or random dance move once (or repeat). Non-blocking."""
…
         """Stop the current dance move."""
         logger.info("Tool call: stop_dance")
         movement_manager = deps.movement_manager
+        movement_manager.clear_move_queue()
         return {"status": "stopped dance and cleared queue"}


+def get_available_emotions_and_descriptions() -> str:
     """Get formatted list of available emotions with descriptions."""
+    if not EMOTION_AVAILABLE:
+        return "Emotions not available"

+    try:
+        emotion_names = RECORDED_MOVES.list_moves()
+        output = "Available emotions:\n"
+        for name in emotion_names:
+            description = RECORDED_MOVES.get(name).description
+            output += f" - {name}: {description}\n"
+        return output
+    except Exception as e:
+        return f"Error getting emotions: {e}"

 class PlayEmotion(Tool):
     """Play a pre-recorded emotion."""
…
         """Stop the current emotion."""
         logger.info("Tool call: stop_emotion")
         movement_manager = deps.movement_manager
+        movement_manager.clear_move_queue()
         return {"status": "stopped emotion and cleared queue"}


 class DoNothing(Tool):
     """Choose to do nothing - stay still and silent. Use when you want to be contemplative or just chill."""
…
         return {"status": "doing nothing", "reason": reason}


 # Registry & specs (dynamic)

 # List of available tool classes
+ALL_TOOLS: Dict[str, Tool] = {cls.name: cls() for cls in get_concrete_subclasses(Tool)}
 ALL_TOOL_SPECS = [tool.spec() for tool in ALL_TOOLS.values()]


 # Dispatcher
 def _safe_load_obj(args_json: str) -> dict[str, Any]:
     try:
+        parsed_args = json.loads(args_json or "{}")
+        return parsed_args if isinstance(parsed_args, dict) else {}
     except Exception:
         logger.warning("bad args_json=%r", args_json)
         return {}
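The pairing of `get_concrete_subclasses` with the `ALL_TOOLS` dict comprehension means tools register themselves: defining another concrete `Tool` subclass is enough for it to be instantiated and exposed to function-calling. A minimal, self-contained sketch of the pattern; the `Echo`/`Wave` classes and the exact shape of `spec()` are illustrative, not the demo's real interface:

```python
import abc
import inspect
from typing import Any, Dict


def get_concrete_subclasses(base):
    """Recursively collect all non-abstract subclasses of a base class."""
    result = []
    for cls in base.__subclasses__():
        if not inspect.isabstract(cls):
            result.append(cls)
        result.extend(get_concrete_subclasses(cls))
    return result


class Tool(abc.ABC):
    """Illustrative base: every tool exposes a name and a function-calling spec."""

    name: str = ""

    @abc.abstractmethod
    def spec(self) -> Dict[str, Any]: ...


class Echo(Tool):
    name = "echo"

    def spec(self) -> Dict[str, Any]:
        return {"type": "function", "name": self.name, "parameters": {}}


class Wave(Tool):
    name = "wave"

    def spec(self) -> Dict[str, Any]:
        return {"type": "function", "name": self.name, "parameters": {}}


# Same registry construction as in tools.py: no hand-maintained list needed.
ALL_TOOLS: Dict[str, Tool] = {cls.name: cls() for cls in get_concrete_subclasses(Tool)}
ALL_TOOL_SPECS = [tool.spec() for tool in ALL_TOOLS.values()]

assert set(ALL_TOOLS) == {"echo", "wave"}
```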
src/reachy_mini_conversation_demo/utils.py
CHANGED

@@ -3,6 +3,7 @@ import argparse
 import warnings

 from reachy_mini_conversation_demo.camera_worker import CameraWorker
+from reachy_mini_conversation_demo.vision.processors import initialize_vision_manager


 def parse_args():
@@ -21,26 +22,27 @@ def parse_args():


 def handle_vision_stuff(args, current_robot):
-    """Initialize camera, head tracker and …
+    """Initialize camera, head tracker, camera worker, and vision manager."""
     camera_worker = None
     head_tracker = None
     vision_manager = None
+
     if not args.no_camera:
+        # Initialize head tracker if specified
         if args.head_tracker is not None:
             if args.head_tracker == "yolo":
-                from reachy_mini_conversation_demo.vision.yolo_head_tracker import (
-                    HeadTracker,
-                )
-
+                from reachy_mini_conversation_demo.vision.yolo_head_tracker import HeadTracker
                 head_tracker = HeadTracker()
-
             elif args.head_tracker == "mediapipe":
                 from reachy_mini_toolbox.vision import HeadTracker
-
                 head_tracker = HeadTracker()

+        # Initialize camera worker
         camera_worker = CameraWorker(current_robot, head_tracker)

+        # Initialize vision manager (handles model download and configuration)
+        vision_manager = initialize_vision_manager(camera_worker)
+
     return camera_worker, head_tracker, vision_manager
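With this change, `handle_vision_stuff` hands back the vision manager alongside the camera worker and head tracker, and degrades gracefully when vision setup fails. A rough usage sketch; the `Namespace` fields mirror the `args.no_camera` and `args.head_tracker` attributes read above, the robot object is assumed to come from the demo's normal startup, and nothing here runs without the Reachy Mini SDK installed:

```python
from argparse import Namespace

from reachy_mini_conversation_demo.utils import handle_vision_stuff


def start_vision(robot):
    # Mirrors the two decisions made above: camera on/off, and which head tracker.
    args = Namespace(no_camera=False, head_tracker="mediapipe")

    camera_worker, head_tracker, vision_manager = handle_vision_stuff(args, robot)

    if vision_manager is None:
        # initialize_vision_manager returns None if model download or setup fails,
        # so the demo keeps running without local vision.
        print("Local vision unavailable; continuing without it")

    return camera_worker, head_tracker, vision_manager
```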
src/reachy_mini_conversation_demo/vision/processors.py
CHANGED

@@ -1,11 +1,10 @@
 import os
-import sys
 import time
 import base64
 import asyncio
 import logging
 import threading
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 from dataclasses import dataclass

 import cv2
@@ -14,6 +13,8 @@ import torch
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from huggingface_hub import snapshot_download

+from reachy_mini_conversation_demo.config import config
+

 logger = logging.getLogger(__name__)

@@ -22,11 +23,9 @@ logger = logging.getLogger(__name__)
 class VisionConfig:
     """Configuration for vision processing."""

-    …
-    model_path: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+    model_path: str = config.LOCAL_VISION_MODEL
     vision_interval: float = 5.0
     max_new_tokens: int = 64
-    temperature: float = 0.7
     jpeg_quality: int = 85
     max_retries: int = 3
     retry_delay: float = 1.0
@@ -36,17 +35,17 @@
 class VisionProcessor:
     """Handles SmolVLM2 model loading and inference."""

-    def __init__(self, …
+    def __init__(self, vision_config: VisionConfig = None):
         """Initialize the vision processor."""
-        self.…
-        self.model_path = self.…
+        self.vision_config = vision_config or VisionConfig()
+        self.model_path = self.vision_config.model_path
         self.device = self._determine_device()
         self.processor = None
         self.model = None
         self._initialized = False

     def _determine_device(self) -> str:
-        pref = self.…
+        pref = self.vision_config.device_preference
         if pref == "cpu":
             return "cpu"
         if pref == "cuda":
@@ -61,7 +60,7 @@
     def initialize(self) -> bool:
         """Load model and processor onto the selected device."""
         try:
-            logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={…
+            logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={config.HF_HOME})")
             self.processor = AutoProcessor.from_pretrained(self.model_path)

             # Select dtype depending on device
@@ -98,13 +97,13 @@
         if not self._initialized:
             return "Vision model not initialized"

-        for attempt in range(self.…
+        for attempt in range(self.vision_config.max_retries):
             try:
                 # Convert to JPEG bytes
                 success, jpeg_buffer = cv2.imencode(
                     ".jpg",
                     cv2_image,
-                    [cv2.IMWRITE_JPEG_QUALITY, self.…
+                    [cv2.IMWRITE_JPEG_QUALITY, self.vision_config.jpeg_quality],
                 )
                 if not success:
                     return "Failed to encode image"
@@ -140,7 +139,7 @@
                 generated_ids = self.model.generate(
                     **inputs,
                     do_sample=False,
-                    max_new_tokens=self.…
+                    max_new_tokens=self.vision_config.max_new_tokens,
                     pad_token_id=self.processor.tokenizer.eos_token_id,
                 )

@@ -165,17 +164,17 @@
                 logger.error(f"CUDA OOM on attempt {attempt + 1}: {e}")
                 if self.device == "cuda":
                     torch.cuda.empty_cache()
-                if attempt < self.…
-                    time.sleep(self.…
+                if attempt < self.vision_config.max_retries - 1:
+                    time.sleep(self.vision_config.retry_delay * (attempt + 1))
                 else:
                     return "GPU out of memory - vision processing failed"

             except Exception as e:
                 logger.error(f"Vision processing failed (attempt {attempt + 1}): {e}")
-                if attempt < self.…
-                    time.sleep(self.…
+                if attempt < self.vision_config.max_retries - 1:
+                    time.sleep(self.vision_config.retry_delay)
                 else:
-                    return f"Vision processing error after {self.…
+                    return f"Vision processing error after {self.vision_config.max_retries} attempts"

     def _extract_response(self, full_text: str) -> str:
         """Extract the assistant's response from the full generated text."""
@@ -194,7 +193,6 @@
     def get_model_info(self) -> Dict[str, Any]:
         """Get information about the loaded model."""
         return {
-            "processor_type": "local",
             "initialized": self._initialized,
             "device": self.device,
             "model_path": self.model_path,
@@ -208,14 +206,13 @@
 class VisionManager:
     """Manages periodic vision processing and scene understanding."""

-    def __init__(self, camera, …
+    def __init__(self, camera, vision_config: VisionConfig = None):
         """Initialize vision manager with camera and configuration."""
         self.camera = camera
-        self.…
-        self.vision_interval = self.…
-        self.processor = …
+        self.vision_config = vision_config or VisionConfig()
+        self.vision_interval = self.vision_config.vision_interval
+        self.processor = VisionProcessor(self.vision_config)

-        self._current_description = ""
         self._last_processed_time = 0

         # Initialize processor
@@ -230,8 +227,8 @@
             current_time = time.time()

             if current_time - self._last_processed_time >= self.vision_interval:
-                …
-                if …
+                frame = self.camera.get_latest_frame()
+                if frame is not None:
                     description = await asyncio.to_thread(
                         lambda: self.processor.process_image(
                             frame, "Briefly describe what you see in one sentence."
@@ -240,7 +237,6 @@

                     # Only update if we got a valid response
                     if description and not description.startswith(("Vision", "Failed", "Error")):
-                        self._current_description = description
                         self._last_processed_time = current_time

                         logger.info(f"Vision update: {description}")
@@ -255,29 +251,6 @@

         logger.info("Vision loop finished")

-    async def get_current_description(self) -> str:
-        """Get the most recent scene description (thread-safe)."""
-        return self._current_description
-
-    async def process_current_frame(self, prompt: str = "Describe what you see in detail.") -> Dict[str, Any]:
-        """Process current camera frame with custom prompt."""
-        try:
-            success, frame = self.camera.read()
-            if not success or frame is None:
-                return {"error": "Failed to capture image from camera"}
-
-            description = await asyncio.to_thread(lambda: self.processor.process_image(frame, prompt))
-
-            return {
-                "description": description,
-                "timestamp": time.time(),
-                "prompt": prompt,
-            }
-
-        except Exception as e:
-            logger.exception("Failed to process current frame")
-            return {"error": f"Frame processing failed: {str(e)}"}
-
     async def get_status(self) -> Dict[str, Any]:
         """Get comprehensive status information."""
         return {
@@ -285,84 +258,59 @@
             "processor_info": self.processor.get_model_info(),
             "config": {
                 "interval": self.vision_interval,
-                "processor_type": self.config.processor_type,
             },
         }


-def …
-    """Initialize …
-    api_preference = cv2.CAP_AVFOUNDATION if sys.platform == "darwin" else 0
-
-    if simulation:
-        # Default build-in camera in SIM
-        # TODO: please, test on Linux and Windows
-        camera = cv2.VideoCapture(0, api_preference)
-    else:
-        # TODO handle macos properly
-        if sys.platform == "darwin":
-            camera = cv2.VideoCapture(camera_index, cv2.CAP_AVFOUNDATION)
-        else:
-            camera = cv2.VideoCapture(camera_index)
-    …
-    """Create the appropriate vision processor (factory)."""
-    if config.processor_type == "openai":
-        try:
-            from .openai_vision import OpenAIVisionProcessor
-    …
-    else:
-        return VisionProcessor(config)
-    …
-    model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
-    cache_dir = os.path.expandvars(os.getenv("HF_HOME", "$HOME/.cache/huggingface"))
-
-    # Only download model if using local processor
-    if processor_type == "local":
-        try:
-            os.makedirs(cache_dir, exist_ok=True)
-            os.environ["HF_HOME"] = cache_dir
-            logger.info("HF_HOME set to %s", cache_dir)
-        except Exception as e:
-            logger.warning("Failed to prepare HF cache dir %s: %s", cache_dir, e)
-            return None
-
+def initialize_vision_manager(camera_worker) -> Optional[VisionManager]:
+    """Initialize vision manager with model download and configuration.
+
+    Args:
+        camera_worker: CameraWorker instance for frame capture
+    Returns:
+        VisionManager instance or None if initialization fails
+
+    """
+    try:
+        model_id = config.LOCAL_VISION_MODEL
+        cache_dir = os.path.expanduser(config.HF_HOME)
+
+        # Prepare cache directory
+        os.makedirs(cache_dir, exist_ok=True)
+        os.environ["HF_HOME"] = cache_dir
+        logger.info("HF_HOME set to %s", cache_dir)
+
+        # Download model to cache
+        logger.info(f"Downloading vision model {model_id} to cache...")
         snapshot_download(
             repo_id=model_id,
             repo_type="model",
             cache_dir=cache_dir,
         )
-    logger.info(f"…
+        logger.info(f"Model {model_id} downloaded to {cache_dir}")
+
+        # Configure vision processing
+        vision_config = VisionConfig(
+            model_path=model_id,
+            vision_interval=5.0,
+            max_new_tokens=64,
+            jpeg_quality=85,
+            max_retries=3,
+            retry_delay=1.0,
+            device_preference="auto",
+        )
+
+        # Initialize vision manager
+        vision_manager = VisionManager(camera_worker, vision_config)
+
+        # Log device info
+        device_info = vision_manager.processor.get_model_info()
+        logger.info(
+            f"Vision processing enabled: {device_info.get('model_path')} on {device_info.get('device')}"
+        )
+
+        return vision_manager
+
+    except Exception as e:
+        logger.error(f"Failed to initialize vision manager: {e}")
+        return None
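One detail worth noting in the `process_image` retry loop above: after a CUDA out-of-memory error the wait grows linearly (`retry_delay * (attempt + 1)`), while other failures wait a flat `retry_delay`. A stripped-down, self-contained sketch of that control flow, with `_infer_once` and `FakeOOM` standing in for the real JPEG-encode-and-generate step and for the CUDA OOM exception:

```python
import time
import logging

logger = logging.getLogger(__name__)

MAX_RETRIES = 3
RETRY_DELAY = 1.0


class FakeOOM(RuntimeError):
    """Stand-in for a CUDA out-of-memory error in this sketch."""


def _infer_once(frame, prompt: str) -> str:
    # Placeholder for the real work: encode the frame to JPEG,
    # build the chat template, and run model.generate(...).
    return f"description of {frame!r} for prompt {prompt!r}"


def process_image(frame, prompt: str) -> str:
    for attempt in range(MAX_RETRIES):
        try:
            return _infer_once(frame, prompt)
        except FakeOOM as e:
            logger.error("OOM on attempt %d: %s", attempt + 1, e)
            if attempt < MAX_RETRIES - 1:
                # Linear backoff: wait longer after each OOM.
                time.sleep(RETRY_DELAY * (attempt + 1))
            else:
                return "GPU out of memory - vision processing failed"
        except Exception as e:
            logger.error("Vision processing failed (attempt %d): %s", attempt + 1, e)
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_DELAY)
            else:
                return f"Vision processing error after {MAX_RETRIES} attempts"
    return f"Vision processing error after {MAX_RETRIES} attempts"
```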
src/reachy_mini_conversation_demo/vision/yolo_head_tracker.py
CHANGED

@@ -94,77 +94,6 @@ class HeadTracker:

         return np.array([norm_x, norm_y], dtype=np.float32)

-    def get_eyes(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
-        """Get eye positions (approximated from face bbox).
-
-        Note: YOLO only provides face bbox, so we estimate eye positions
-
-        Args:
-            img: Input image
-
-        Returns:
-            Tuple of (left_eye, right_eye) in [-1, 1] coordinates
-
-        """
-        h, w = img.shape[:2]
-
-        # Run YOLO inference
-        results = self.model(img, verbose=False)
-        detections = Detections.from_ultralytics(results[0])
-
-        # Select best face
-        face_idx = self._select_best_face(detections)
-        if face_idx is None:
-            return None, None
-
-        bbox = detections.xyxy[face_idx]
-
-        # Estimate eye positions from face bbox (approximate locations)
-        face_width = bbox[2] - bbox[0]
-        face_height = bbox[3] - bbox[1]
-
-        # Eye positions are roughly at 1/3 and 2/3 of face width, 1/3 of face height
-        eye_y = bbox[1] + face_height * 0.35
-        left_eye_x = bbox[0] + face_width * 0.35
-        right_eye_x = bbox[0] + face_width * 0.65
-
-        # Convert to MediaPipe coordinates
-        left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
-        right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
-
-        return left_eye, right_eye
-
-    def get_eyes_from_landmarks(self, face_landmarks) -> Tuple[np.ndarray, np.ndarray]:
-        """Compatibility method - YOLO doesn't have landmarks, so we store bbox in the object."""
-        if not hasattr(face_landmarks, "_bbox") or not hasattr(face_landmarks, "_img_shape"):
-            raise ValueError("Face landmarks object missing required attributes")
-
-        bbox = face_landmarks._bbox
-        h, w = face_landmarks._img_shape[:2]
-
-        # Estimate eyes from stored bbox
-        face_width = bbox[2] - bbox[0]
-        face_height = bbox[3] - bbox[1]
-
-        eye_y = bbox[1] + face_height * 0.35
-        left_eye_x = bbox[0] + face_width * 0.35
-        right_eye_x = bbox[0] + face_width * 0.65
-
-        left_eye = np.array([(left_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
-        right_eye = np.array([(right_eye_x / w) * 2 - 1, (eye_y / h) * 2 - 1], dtype=np.float32)
-
-        return left_eye, right_eye
-
-    def get_eye_center(self, face_landmarks) -> np.ndarray:
-        """Get center point between estimated eyes."""
-        left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
-        return np.mean([left_eye, right_eye], axis=0)
-
-    def get_roll(self, face_landmarks) -> float:
-        """Estimate roll from eye positions (will be 0 for YOLO since we estimate symmetric eyes)."""
-        left_eye, right_eye = self.get_eyes_from_landmarks(face_landmarks)
-        return float(np.arctan2(right_eye[1] - left_eye[1], right_eye[0] - left_eye[0]))
-
     def get_head_position(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[float]]:
         """Get head position from face detection.

@@ -204,18 +133,3 @@ class HeadTracker:
         except Exception as e:
             logger.error(f"Error in head position detection: {e}")
             return None, None
-
-    def cleanup(self):
-        """Clean up resources."""
-        if hasattr(self, "model"):
-            del self.model
-            logger.info("YOLO model cleaned up")
-
-
-class FaceLandmarks:
-    """Simple container for face detection results to maintain API compatibility."""
-
-    def __init__(self, bbox: np.ndarray, img_shape: tuple):
-        """Initialize with bounding box and image shape."""
-        self._bbox = bbox
-        self._img_shape = img_shape
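What remains of the tracker still reports positions in the same normalized image space the deleted eye helpers used: pixel coordinates mapped into [-1, 1] via `(x / w) * 2 - 1`. A tiny sketch of that mapping (the function name is illustrative):

```python
import numpy as np


def normalize_point(x: float, y: float, img_shape: tuple) -> np.ndarray:
    """Map a pixel coordinate to [-1, 1] on both axes, as the tracker does."""
    h, w = img_shape[:2]
    norm_x = (x / w) * 2 - 1
    norm_y = (y / h) * 2 - 1
    return np.array([norm_x, norm_y], dtype=np.float32)


# The center of a 640x480 frame maps to (0, 0); the top-left corner to (-1, -1).
print(normalize_point(320, 240, (480, 640, 3)))  # -> [0. 0.]
print(normalize_point(0, 0, (480, 640, 3)))      # -> [-1. -1.]
```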
uv.lock
CHANGED

The diff for this file is too large to render. See raw diff.