Switch to high-quality Piper TTS and add label translations

2026-01-29 23:48:19 +01:00
commit d80c619df9
3934 changed files with 1451600 additions and 0 deletions
@@ -0,0 +1,138 @@
+from fastapi import FastAPI, UploadFile, File, Request
+from fastapi.staticfiles import StaticFiles
+from fastapi.responses import HTMLResponse, FileResponse
+import cv2
+import os
+import time
+import socket
+import sys
+from .vision import analyze_image_with_yolo
+
+# FastAPI application instance; the routes and static mounts below attach to it.
+app = FastAPI(title="Home-Bro Brain")
+
+import subprocess
+
+def play_voice(text: str):
+    """Synthesize *text* with Piper and play it back without blocking the caller.
+
+    The work runs in a background thread: Piper writes the speech to
+    ``snapshots/speech.wav`` and, on macOS only, ``afplay`` plays that file.
+    Any failure is printed and otherwise ignored, so a broken TTS setup never
+    propagates an error to the caller.
+
+    Args:
+        text: The sentence to speak. It is handed to Piper on stdin instead of
+            being interpolated into a shell command, so quotes or shell
+            metacharacters in the text cannot break or alter the command.
+    """
+    def _speak():
+        try:
+            print(f"DEBUG: Piper TTS für: '{text}'")
+            # Piper binary inside its own Python 3.12 virtualenv, plus the
+            # German voice model.
+            piper_path = "./venv_piper/bin/piper"
+            model_path = "./piper/models/de_DE-thorsten-high.onnx"
+
+            # Streaming raw PCM into afplay via stdin is unreliable, so the
+            # speech is written to a WAV file first and played from there.
+            wav_path = "snapshots/speech.wav"
+
+            # Argument list + stdin instead of `echo '...' | piper` through a
+            # shell. The trailing newline mirrors what `echo` used to send.
+            subprocess.run(
+                [piper_path, "--model", model_path, "--output_file", wav_path],
+                input=(text + "\n").encode("utf-8"),
+                check=True,
+            )
+
+            if sys.platform == "darwin":
+                subprocess.run(["afplay", wav_path], check=True)
+
+        except Exception as e:
+            # Deliberately best-effort: speech output must never take the
+            # request handling down with it.
+            print(f"Piper TTS Fehler: {e}")
+
+    # Run in a separate thread so the caller is not blocked by synthesis/playback.
+    import threading
+    threading.Thread(target=_speak).start()
+
+# Static files (frontend) and the snapshot images written by the analyze routes.
+app.mount("/static", StaticFiles(directory="static"), name="static")
+app.mount("/snapshots", StaticFiles(directory="snapshots"), name="snapshots")
+
+# In-memory store for the most recent analysis status; the analyze routes
+# replace this dict and /api/latest returns it as-is.
+latest_status = {
+    "room": "Warten...",
+    "comment": "Noch keine Daten empfangen.",
+    "timestamp": None,
+    "image_url": None
+}
+
+@app.get("/", response_class=HTMLResponse)
+async def get_index():
+    """Serve the frontend entry page from the static directory."""
+    index_page = "static/index.html"
+    return FileResponse(index_page)
+
+@app.get("/api/latest")
+async def get_latest():
+    """Return the most recent analysis status held in ``latest_status``."""
+    return latest_status
+
+@app.get("/api/info")
+async def get_info():
+    """Report the host name, the local IP address and the port of this server.
+
+    The local IP is determined by connecting a UDP socket toward a public
+    address and reading back the local address chosen for that socket. If
+    that fails, ``127.0.0.1`` is reported instead.
+
+    Returns:
+        A dict with the keys ``hostname``, ``ip`` and ``port``.
+    """
+    hostname = socket.gethostname()
+    try:
+        # The context manager closes the socket even when connect() or
+        # getsockname() raises, which the manual close() did not.
+        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as probe:
+            probe.connect(("8.8.8.8", 80))
+            local_ip = probe.getsockname()[0]
+    except OSError:
+        # No usable route/interface: fall back to loopback.
+        local_ip = "127.0.0.1"
+
+    return {
+        "hostname": hostname,
+        "ip": local_ip,
+        "port": 8000
+    }
+
+@app.post("/analyze/pi")
+async def analyze_pi(file: UploadFile = File(...)):
+    """Store an uploaded image, analyze it, speak the result and update the status.
+
+    The upload overwrites ``snapshots/pi_last.jpg``. The analysis text is
+    passed to ``play_voice`` and stored in ``latest_status``.
+
+    Returns:
+        A dict with the room name and the generated comment.
+    """
+    global latest_status
+
+    # Persist the uploaded image.
+    snapshot_path = "snapshots/pi_last.jpg"
+    image_bytes = await file.read()
+    with open(snapshot_path, "wb") as snapshot:
+        snapshot.write(image_bytes)
+
+    comment = analyze_image_with_yolo(snapshot_path)
+    play_voice(comment)
+
+    # Publish the new status; the query parameter defeats browser caching.
+    captured_at = time.strftime("%H:%M:%S")
+    cache_buster = int(time.time())
+    latest_status = {
+        "room": "Wohnzimmer (Pi)",
+        "comment": comment,
+        "timestamp": captured_at,
+        "image_url": f"/snapshots/pi_last.jpg?t={cache_buster}",
+    }
+
+    return {"room": "Wohnzimmer", "comment": comment}
+
+@app.get("/analyze/tapo/{room_name}")
+async def analyze_tapo(room_name: str, ip: str):
+    """Grab one frame from a Tapo camera via RTSP, analyze it and update the status.
+
+    Credentials are read from the ``TAPO_USER`` and ``TAPO_PASSWORD``
+    environment variables. The frame is saved as
+    ``snapshots/{room_name}_tapo.jpg``; the analysis text is passed to
+    ``play_voice`` and stored in ``latest_status``.
+
+    Args:
+        room_name: Room label taken from the URL path; also used in the
+            snapshot file name.
+        ip: Address of the camera.
+
+    Returns:
+        A dict with the room name and the comment, or a dict with an
+        ``error`` key when no frame could be read.
+    """
+    global latest_status
+    # RTSP access to the Tapo camera.
+    user = os.getenv("TAPO_USER")
+    pw = os.getenv("TAPO_PASSWORD")
+    url = f"rtsp://{user}:{pw}@{ip}:554/stream1"
+
+    cap = cv2.VideoCapture(url)
+    try:
+        ret, frame = cap.read()
+    finally:
+        # Release the stream as soon as the frame is read, so it is neither
+        # held open during the analysis nor leaked if a later step raises.
+        cap.release()
+
+    if not ret:
+        return {"error": "Tapo nicht erreichbar"}
+
+    path = f"snapshots/{room_name}_tapo.jpg"
+    cv2.imwrite(path, frame)
+    result = analyze_image_with_yolo(path)
+    play_voice(result)
+
+    latest_status = {
+        "room": room_name,
+        "comment": result,
+        "timestamp": time.strftime("%H:%M:%S"),
+        "image_url": f"/snapshots/{room_name}_tapo.jpg?t={int(time.time())}"
+    }
+
+    return {"room": room_name, "comment": result}
@@ -0,0 +1,73 @@
+from ultralytics import YOLO
+import random
+
+# Load the model (the nano variant is fast and good enough for cups/plates).
+model = YOLO("models/yolo11n.pt")
+
+# German translations for common objects, keyed by the detector's label.
+# Labels without an entry are rendered as "ein <label>" by generate_description.
+label_map = {
+    "cup": "eine Tasse",
+    "bottle": "eine Flasche",
+    "cell phone": "ein Handy",
+    "person": "einen Menschen (vermutlich Matthias, der schon wieder nichts tut)",
+    "laptop": "einen Laptop",
+    "chair": "einen Stuhl",
+    "remote": "eine Fernbedienung",
+    "keyboard": "eine Tastatur",
+    "mouse": "eine Maus",
+    "bicycle": "ein Fahrrad"
+}
+
+def analyze_image_with_yolo(image_path):
+    """Run the detector on *image_path* and return a spoken-style comment.
+
+    Every detected box contributes its class label (duplicates included).
+    The returned string is the description followed by the insult, separated
+    by a single space.
+    """
+    predictions = model.predict(source=image_path, conf=0.45)
+
+    detected_labels = [
+        prediction.names[int(box.cls)]
+        for prediction in predictions
+        for box in prediction.boxes
+    ]
+
+    sentence = generate_description(detected_labels)
+    remark = generate_insult(detected_labels)
+    return " ".join((sentence, remark))
+
+def generate_description(items, translations=None):
+    """Build the German "Ich sehe ..." sentence for the detected labels.
+
+    Duplicate labels are mentioned once, in order of first appearance, so the
+    same detections always produce the same sentence.
+
+    Args:
+        items: Detected labels; may contain duplicates.
+        translations: Optional mapping from label to German phrase. Defaults
+            to the module-level ``label_map``. Labels missing from the
+            mapping are rendered as ``"ein <label>"``.
+
+    Returns:
+        The description sentence, or a fixed "nothing seen" sentence when
+        *items* is empty.
+    """
+    if not items:
+        return "Ich sehe absolut gar nichts."
+
+    if translations is None:
+        translations = label_map
+
+    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
+    # make the word order depend on string hashing.
+    unique_items = dict.fromkeys(items)
+    phrases = [translations.get(item, f"ein {item}") for item in unique_items]
+
+    if len(phrases) == 1:
+        return f"Ich sehe {phrases[0]}."
+
+    *leading, last = phrases
+    return f"Ich sehe {', '.join(leading)} und {last}."
+
+def generate_insult(items):
+    """Pick a nagging remark that matches the detected labels.
+
+    The labels with remarks are checked in the order cup, bottle, cell phone;
+    for the first one present in *items* a random remark is returned. With no
+    match, one of two fixed fallbacks is returned depending on whether
+    anything was detected at all.
+    """
+    # Annoying comments per object label.
+    remarks_by_label = {
+        "cup": [
+            "Matthias, die Tasse auf dem Tisch hat mittlerweile ein eigenes Ökosystem. Räum sie weg!",
+            "Ist das Kunst oder kann das weg? Ich meine die Tasse, Matthias.",
+            "Noch eine Tasse? Willst du ein Café eröffnen oder bist du einfach nur faul?",
+        ],
+        "bottle": [
+            "Leergut gehört in die Kiste, nicht in mein Sichtfeld.",
+            "Flasche leer, Kopf leer? Bring das Ding weg.",
+        ],
+        "cell phone": [
+            "Schon wieder am Handy? Kein Wunder, dass hier nichts vorangeht.",
+            "Digital Detox würde dir gut tun, Matthias. Leg das Ding weg.",
+        ],
+    }
+
+    # First label (in table order) that was actually detected, if any.
+    candidates = next(
+        (remarks for label, remarks in remarks_by_label.items() if label in items),
+        None,
+    )
+    if candidates is not None:
+        return random.choice(candidates)
+
+    if items:
+        return "Eigentlich sieht alles okay aus... Das macht mich erst recht misstrauisch."
+    return "Ich bin mir aber sicher, du hast irgendwo Dreck versteckt."