master #2

Open
jimmy wants to merge 17 commits from master into pi
7 changed files with 199 additions and 16 deletions

.gitignore vendored

@@ -1,2 +1,3 @@
 .env
 image.png
+images/

README.md

@@ -1,14 +1,20 @@
 # Speech to Image
 ## Pi Setup
 https://forums.raspberrypi.com/viewtopic.php?t=330358
 https://learn.adafruit.com/adafruit-i2s-mems-microphone-breakout/raspberry-pi-wiring-test
 https://github.com/alphacep/vosk-server
-``` sudo apt install python3-pip git python3-pyaudio vlc```
-``` sudo pip3 install websockets sounddevice```
+```sudo apt install python3-pip git python3-pyaudio vlc```
+```sudo pip3 install sounddevice aiohttp aiofiles python-vlc termcolor vosk websockets```
+```sudo docker-compose up -d vosk```
 ## Image Server
 https://huggingface.co/CompVis/stable-diffusion-v1-4
@@ -23,4 +29,6 @@ distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
 ```
 sudo apt-get update
 sudo apt-get install -y nvidia-docker2
 ```
+```sudo docker-compose up -d --build imageserver```
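Before wiring up the Pi side, it is worth confirming the I2S mic from the Adafruit guide is actually visible. A minimal check, using the same sounddevice call that pi/local.py exposes through its -l/--list-devices flag:

```python
# Lists the audio devices sounddevice can see on the Pi; the I2S mic should
# appear as an input device. Equivalent to `python3 pi/local.py -l`.
import sounddevice as sd

print(sd.query_devices())
```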

docker-compose.yml

@@ -5,11 +5,19 @@ services:
     image: imageserver
     build: imageserver
     runtime: nvidia
-    command: python3 /main.py
-    ports:
-      - 8000:8000
     env_file:
      - .env
     volumes:
      - ./images:/images
+      - ./notebooks:/notebooks
+      - ./models:/root/.cache/huggingface/diffusers/
+    ports:
+      - 8888:8888
+      - 8000:8000
   vosk:
     image: alphacep/kaldi-en
     ports:
      - 2700:2700

imageserver/Dockerfile

@@ -1,8 +1,14 @@
 FROM nvidia/cuda:11.6.0-base-ubuntu20.04
 RUN apt-get update && apt-get install python3 python3-pip -y
-RUN pip3 install --upgrade diffusers transformers scipy python-dotenv cuda-python && \
-    pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117 && \
+RUN pip3 install --upgrade diffusers transformers scipy python-dotenv cuda-python fastapi uvicorn httplib2 && \
+    pip install torch==1.11.0+cu115 torchvision==0.12.0+cu115 torchaudio==0.11.0+cu115 -f https://download.pytorch.org/whl/torch_stable.html
-COPY main.py /main.py
+WORKDIR /app
+COPY main.py /app/main.py
 VOLUME /root/.cache/huggingface/diffusers/
+CMD [ "uvicorn", "main:app", "--host", "0.0.0.0" ]
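With the torch wheels now pinned to cu115 on a CUDA 11.6 base image, a quick sanity check inside the running container shows whether `runtime: nvidia` is actually exposing the GPU. A minimal sketch; the printed version string is an assumption based on the pinned wheels:

```python
# GPU sanity check for the imageserver container; if is_available() prints
# False, fix the nvidia-docker2 / `runtime: nvidia` setup before expecting
# the diffusers pipeline to load on "cuda".
import torch

print(torch.__version__)          # expected: 1.11.0+cu115 with the pinned wheels
print(torch.cuda.is_available())  # True when the nvidia runtime is working
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
```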

imageserver/main.py

@@ -1,19 +1,55 @@
+from multiprocessing import context
+from httplib2 import Response
 import torch
 import uuid
+import os
 from diffusers import StableDiffusionPipeline
 from dotenv import load_dotenv
 from os import getenv
+from fastapi import FastAPI, Response, HTTPException
+from pydantic import BaseModel
+import io
+from PIL.PngImagePlugin import PngInfo
 load_dotenv()
 # get your token at https://huggingface.co/settings/tokens
 pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16, use_auth_token=getenv("TOKEN"))
 pipe.to("cuda")
-prompt = "metal buttons are often soldiers who just got out of high school or a couple of years graduated from college easy as an air conditioned box about radar the patriot radar known as the a n n e e e pi this is an extremely powerful radar unit so powerful that they actually"
+class Text(BaseModel):
+    text: str
-for _ in range(10):
-    image = pipe(prompt)["sample"][0]
-    image.save(f"{uuid.uuid4()}.png".replace(" ", "_"))
+app = FastAPI()
+@app.get("/",
+    responses={
+        200: {
+            "content": {"image/png": {}}
+        }
+    },
+    response_class=Response
+)
+def root(text: str):
+    prompt = text.replace('+', ' ')
+    print(prompt)
+    try:
+        generator = torch.Generator("cuda").manual_seed(1024)
+        resp = pipe(prompt)
+        print(resp)
+        image = resp.images[0]
+    except RuntimeError as e:
+        raise HTTPException(status_code=202, detail="Busy")
+    except:
+        raise HTTPException(status_code=504)
+    metadata = PngInfo()
+    metadata.add_text("text", prompt)
+    image.save(f'/images/{str(uuid.uuid4())}.png', pnginfo=metadata)
+    imgByteArr = io.BytesIO()
+    image.save(imgByteArr, format="PNG")
+    imgByteArr = imgByteArr.getvalue()
+    running = False
+    return Response(content=imgByteArr, media_type="image/png")
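The new endpoint is a plain GET returning raw PNG bytes, with 202 used as a busy signal when the GPU raises a RuntimeError. A minimal client sketch, assuming the server is reachable on the 8000 port mapped in docker-compose; `requests` and the `out.png` filename are illustrative, not part of this PR:

```python
# Fetch one image from the imageserver and write it to disk; this mirrors
# what pi/local.py does asynchronously with aiohttp.
import requests

resp = requests.get("http://localhost:8000/", params={"text": "a red lighthouse at dusk"})
if resp.status_code == 200:
    with open("out.png", "wb") as f:
        f.write(resp.content)
elif resp.status_code == 202:
    print("server busy, retry later")
else:
    print("request failed:", resp.status_code)
```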

pi/local.py Normal file

@@ -0,0 +1,124 @@
#!/usr/bin/env python3
import argparse
import queue
import sys
import json
import asyncio
import sounddevice as sd
from termcolor import colored
import aiohttp
import aiofiles
import vlc
from time import sleep
from vosk import Model, KaldiRecognizer

q = queue.Queue()

def int_or_str(text):
    """Helper function for argument parsing."""
    try:
        return int(text)
    except ValueError:
        return text

def callback(indata, frames, time, status):
    """This is called (from a separate thread) for each audio block."""
    if status:
        print(status, file=sys.stderr)
    q.put(bytes(indata))

async def main():
    # VLC player that displays the most recently generated image
    vlc_instance = vlc.Instance()
    player = vlc_instance.media_player_new()
    media = vlc_instance.media_new("image.png")
    player.set_media(media)
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument(
        "-l", "--list-devices", action="store_true",
        help="show list of audio devices and exit")
    args, remaining = parser.parse_known_args()
    if args.list_devices:
        print(sd.query_devices())
        parser.exit(0)
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[parser])
    parser.add_argument(
        "-f", "--filename", type=str, metavar="FILENAME",
        help="audio file to store recording to")
    parser.add_argument(
        "-d", "--device", type=int_or_str,
        help="input device (numeric ID or substring)")
    parser.add_argument(
        "-r", "--samplerate", type=int, help="sampling rate")
    args = parser.parse_args(remaining)
    try:
        if args.samplerate is None:
            device_info = sd.query_devices(args.device, "input")
            # soundfile expects an int, sounddevice provides a float:
            args.samplerate = int(device_info["default_samplerate"])
        model = Model(lang="en-us")
        if args.filename:
            dump_fn = open(args.filename, "wb")
        else:
            dump_fn = None
        with sd.RawInputStream(samplerate=args.samplerate, blocksize=8000, device=args.device,
                               dtype="int16", channels=1, callback=callback):
            print("#" * 80)
            print("Press Ctrl+C to stop the recording")
            print("#" * 80)
            rec = KaldiRecognizer(model, args.samplerate)
            while True:
                data = q.get()
                if rec.AcceptWaveform(data):
                    # read the final result once and reuse it
                    result = rec.Result()
                    print(result)
                    j = json.loads(result)
                    if "text" in j and "result" in j:
                        # colour each word by recognizer confidence
                        n = 0
                        for word in j["result"]:
                            n += float(word["conf"])
                            if float(word["conf"]) > 0.7:
                                print(colored(word["word"], "green"), end=" ")
                            elif float(word["conf"]) > 0.5:
                                print(colored(word["word"], "yellow"), end=" ")
                            else:
                                print(colored(word["word"], "red"), end=" ")
                        print(n/len(j["result"]))
                        print("Generating Image")
                        if len(j["result"]) > 2:
                            # ask the image server to render the recognised phrase
                            async with aiohttp.ClientSession() as session:
                                url = f'http://192.168.1.95:8000?text={j["text"].replace(" ", "+")}'
                                async with session.get(url) as resp:
                                    print(resp.status)
                                    if resp.status == 200:
                                        f = await aiofiles.open('image.png', mode='wb')
                                        await f.write(await resp.read())
                                        await f.close()
                                        print("Image generated")
                                        # show the fresh image.png via VLC
                                        player.stop()
                                        player.play()
                                        sleep(1)
                                        player.pause()
                # else:
                #     print(rec.PartialResult())
                if dump_fn is not None:
                    dump_fn.write(data)
    except KeyboardInterrupt:
        print("\nDone")
        parser.exit(0)
    except Exception as e:
        parser.exit(type(e).__name__ + ": " + str(e))

if __name__ == '__main__':
    asyncio.run(main())
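The colour-coding loop assumes the recognizer's final result carries a per-word `result` array next to the full `text`. A sketch of the shape being parsed; the words and confidence values are illustrative, not real vosk output:

```python
# Same parsing as the loop in local.py, run on an illustrative payload.
import json

sample = '{"result": [{"word": "hello", "conf": 0.94}, {"word": "world", "conf": 0.58}], "text": "hello world"}'
j = json.loads(sample)
if "text" in j and "result" in j:
    avg = sum(float(w["conf"]) for w in j["result"]) / len(j["result"])
    print(j["text"], avg)  # local.py prints this average after the coloured words
```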


@@ -43,10 +43,10 @@ async def run_test():
             if "text" in j and "result" in j:
                 n = 0
                 for word in j["result"]:
-                    n += int(word["conf"])
-                    if int(word["conf"]) > 0.7:
+                    n += float(word["conf"])
+                    if float(word["conf"]) > 0.7:
                         print(colored(word["word"], "green"), end=" ")
-                    elif int(word["conf"]) > 0.7:
+                    elif float(word["conf"]) > 0.5:
                         print(colored(word["word"], "yellow"), end=" ")
                     else:
                         print(colored(word["word"], "red"), end=" ")
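The int-to-float change matters because vosk confidences are floats in [0, 1]: int() truncates any score below 1.0 to 0, so nothing cleared the 0.7 threshold and words were coloured red regardless (the old elif also repeated 0.7, leaving the yellow branch dead). A one-line illustration:

```python
# Why int() broke the thresholds: truncation sends any conf below 1.0 to 0.
conf = 0.83
print(int(conf) > 0.7)    # False, since int(0.83) == 0
print(float(conf) > 0.7)  # True
```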