master #2

Open
jimmy wants to merge 17 commits from master into pi
7 changed files with 199 additions and 16 deletions

.gitignore vendored

@@ -1,2 +1,3 @@
 .env
 image.png
+images/

README.md

@@ -1,14 +1,20 @@
 # Speech to Image
 ## Pi Setup
 https://forums.raspberrypi.com/viewtopic.php?t=330358
 https://learn.adafruit.com/adafruit-i2s-mems-microphone-breakout/raspberry-pi-wiring-test
 https://github.com/alphacep/vosk-server
-``` sudo apt install python3-pip git python3-pyaudio vlc```
-``` sudo pip3 install websockets sounddevice```
+```sudo apt install python3-pip git python3-pyaudio vlc```
+```sudo pip3 install sounddevice aiohttp aiofiles python-vlc termcolor vosk websockets```
+```sudo docker-compose up -d vosk```
 ## Image Server
 https://huggingface.co/CompVis/stable-diffusion-v1-4
@@ -23,4 +29,6 @@ distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
 ```
 sudo apt-get update
 sudo apt-get install -y nvidia-docker2
 ```
+```sudo docker-compose up -d --build imageserver```
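Before wiring up the Pi side, it is worth confirming the I2S mic from the Adafruit guide is actually visible. A minimal check, using the same sounddevice call that pi/local.py exposes through its -l/--list-devices flag:

```python
# Lists the audio devices sounddevice can see on the Pi; the I2S mic should
# appear as an input device. Equivalent to `python3 pi/local.py -l`.
import sounddevice as sd

print(sd.query_devices())
```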

docker-compose.yml

@@ -5,11 +5,19 @@ services:
     image: imageserver
     build: imageserver
     runtime: nvidia
-    command: python3 /main.py
-    ports:
-      - 8000:8000
     env_file:
      - .env
     volumes:
      - ./images:/images
+      - ./notebooks:/notebooks
+      - ./models:/root/.cache/huggingface/diffusers/
+    ports:
+      - 8888:8888
+      - 8000:8000
   vosk:
     image: alphacep/kaldi-en
     ports:
      - 2700:2700

imageserver/Dockerfile

@@ -1,8 +1,14 @@
 FROM nvidia/cuda:11.6.0-base-ubuntu20.04
 RUN apt-get update && apt-get install python3 python3-pip -y
-RUN pip3 install --upgrade diffusers transformers scipy python-dotenv cuda-python && \
-    pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117 && \
+RUN pip3 install --upgrade diffusers transformers scipy python-dotenv cuda-python fastapi uvicorn httplib2 && \
+    pip install torch==1.11.0+cu115 torchvision==0.12.0+cu115 torchaudio==0.11.0+cu115 -f https://download.pytorch.org/whl/torch_stable.html
-COPY main.py /main.py
+WORKDIR /app
+COPY main.py /app/main.py
 VOLUME /root/.cache/huggingface/diffusers/
+CMD [ "uvicorn", "main:app", "--host", "0.0.0.0" ]
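With the torch wheels now pinned to cu115 on a CUDA 11.6 base image, a quick sanity check inside the running container shows whether `runtime: nvidia` is actually exposing the GPU. A minimal sketch; the printed version string is an assumption based on the pinned wheels:

```python
# GPU sanity check for the imageserver container; if is_available() prints
# False, fix the nvidia-docker2 / `runtime: nvidia` setup before expecting
# the diffusers pipeline to load on "cuda".
import torch

print(torch.__version__)          # expected: 1.11.0+cu115 with the pinned wheels
print(torch.cuda.is_available())  # True when the nvidia runtime is working
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
```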

imageserver/main.py

@@ -1,19 +1,55 @@
+from multiprocessing import context
+from httplib2 import Response
 import torch
 import uuid
+import os
 from diffusers import StableDiffusionPipeline
 from dotenv import load_dotenv
 from os import getenv
+from fastapi import FastAPI, Response, HTTPException
+from pydantic import BaseModel
+import io
+from PIL.PngImagePlugin import PngInfo
 load_dotenv()
 # get your token at https://huggingface.co/settings/tokens
 pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16, use_auth_token=getenv("TOKEN"))
 pipe.to("cuda")
-prompt = "metal buttons are often soldiers who just got out of high school or a couple of years graduated from college easy as an air conditioned box about radar the patriot radar known as the a n n e e e pi this is an extremely powerful radar unit so powerful that they actually"
+class Text(BaseModel):
+    text: str
-for _ in range(10):
-    image = pipe(prompt)["sample"][0]
-    image.save(f"{uuid.uuid4()}.png".replace(" ", "_"))
+app = FastAPI()
+@app.get("/",
+    responses={
+        200: {
+            "content": {"image/png": {}}
+        }
+    },
+    response_class=Response
+)
+def root(text: str):
+    prompt = text.replace('+', ' ')
+    print(prompt)
+    try:
+        generator = torch.Generator("cuda").manual_seed(1024)
+        resp = pipe(prompt)
+        print(resp)
+        image = resp.images[0]
+    except RuntimeError as e:
+        raise HTTPException(status_code=202, detail="Busy")
+    except:
+        raise HTTPException(status_code=504)
+    metadata = PngInfo()
+    metadata.add_text("text", prompt)
+    image.save(f'/images/{str(uuid.uuid4())}.png', pnginfo=metadata)
+    imgByteArr = io.BytesIO()
+    image.save(imgByteArr, format="PNG")
+    imgByteArr = imgByteArr.getvalue()
+    running = False
+    return Response(content=imgByteArr, media_type="image/png")
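The new endpoint is a plain GET returning raw PNG bytes, with 202 used as a busy signal when the GPU raises a RuntimeError. A minimal client sketch, assuming the server is reachable on the 8000 port mapped in docker-compose; `requests` and the `out.png` filename are illustrative, not part of this PR:

```python
# Fetch one image from the imageserver and write it to disk; this mirrors
# what pi/local.py does asynchronously with aiohttp.
import requests

resp = requests.get("http://localhost:8000/", params={"text": "a red lighthouse at dusk"})
if resp.status_code == 200:
    with open("out.png", "wb") as f:
        f.write(resp.content)
elif resp.status_code == 202:
    print("server busy, retry later")
else:
    print("request failed:", resp.status_code)
```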

pi/local.py Normal file

@@ -0,0 +1,124 @@
#!/usr/bin/env python3
import argparse
import queue
import sys
import json
import asyncio
import sounddevice as sd
from termcolor import colored
import aiohttp
import aiofiles
import vlc
from time import sleep
from vosk import Model, KaldiRecognizer

q = queue.Queue()

def int_or_str(text):
    """Helper function for argument parsing."""
    try:
        return int(text)
    except ValueError:
        return text

def callback(indata, frames, time, status):
    """This is called (from a separate thread) for each audio block."""
    if status:
        print(status, file=sys.stderr)
    q.put(bytes(indata))

async def main():
    # VLC player that displays the most recently generated image
    vlc_instance = vlc.Instance()
    player = vlc_instance.media_player_new()
    media = vlc_instance.media_new("image.png")
    player.set_media(media)
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument(
        "-l", "--list-devices", action="store_true",
        help="show list of audio devices and exit")
    args, remaining = parser.parse_known_args()
    if args.list_devices:
        print(sd.query_devices())
        parser.exit(0)
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[parser])
    parser.add_argument(
        "-f", "--filename", type=str, metavar="FILENAME",
        help="audio file to store recording to")
    parser.add_argument(
        "-d", "--device", type=int_or_str,
        help="input device (numeric ID or substring)")
    parser.add_argument(
        "-r", "--samplerate", type=int, help="sampling rate")
    args = parser.parse_args(remaining)
    try:
        if args.samplerate is None:
            device_info = sd.query_devices(args.device, "input")
            # soundfile expects an int, sounddevice provides a float:
            args.samplerate = int(device_info["default_samplerate"])
        model = Model(lang="en-us")
        if args.filename:
            dump_fn = open(args.filename, "wb")
        else:
            dump_fn = None
        with sd.RawInputStream(samplerate=args.samplerate, blocksize=8000, device=args.device,
                               dtype="int16", channels=1, callback=callback):
            print("#" * 80)
            print("Press Ctrl+C to stop the recording")
            print("#" * 80)
            rec = KaldiRecognizer(model, args.samplerate)
            while True:
                data = q.get()
                if rec.AcceptWaveform(data):
                    # read the final result once and reuse it
                    result = rec.Result()
                    print(result)
                    j = json.loads(result)
                    if "text" in j and "result" in j:
                        # colour each word by recognizer confidence
                        n = 0
                        for word in j["result"]:
                            n += float(word["conf"])
                            if float(word["conf"]) > 0.7:
                                print(colored(word["word"], "green"), end=" ")
                            elif float(word["conf"]) > 0.5:
                                print(colored(word["word"], "yellow"), end=" ")
                            else:
                                print(colored(word["word"], "red"), end=" ")
                        print(n/len(j["result"]))
                        print("Generating Image")
                        if len(j["result"]) > 2:
                            # ask the image server to render the recognised phrase
                            async with aiohttp.ClientSession() as session:
                                url = f'http://192.168.1.95:8000?text={j["text"].replace(" ", "+")}'
                                async with session.get(url) as resp:
                                    print(resp.status)
                                    if resp.status == 200:
                                        f = await aiofiles.open('image.png', mode='wb')
                                        await f.write(await resp.read())
                                        await f.close()
                                        print("Image generated")
                                        # show the fresh image.png via VLC
                                        player.stop()
                                        player.play()
                                        sleep(1)
                                        player.pause()
                # else:
                #     print(rec.PartialResult())
                if dump_fn is not None:
                    dump_fn.write(data)
    except KeyboardInterrupt:
        print("\nDone")
        parser.exit(0)
    except Exception as e:
        parser.exit(type(e).__name__ + ": " + str(e))

if __name__ == '__main__':
    asyncio.run(main())
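The colour-coding loop assumes the recognizer's final result carries a per-word `result` array next to the full `text`. A sketch of the shape being parsed; the words and confidence values are illustrative, not real vosk output:

```python
# Same parsing as the loop in local.py, run on an illustrative payload.
import json

sample = '{"result": [{"word": "hello", "conf": 0.94}, {"word": "world", "conf": 0.58}], "text": "hello world"}'
j = json.loads(sample)
if "text" in j and "result" in j:
    avg = sum(float(w["conf"]) for w in j["result"]) / len(j["result"])
    print(j["text"], avg)  # local.py prints this average after the coloured words
```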


@@ -43,10 +43,10 @@ async def run_test():
             if "text" in j and "result" in j:
                 n = 0
                 for word in j["result"]:
-                    n += int(word["conf"])
-                    if int(word["conf"]) > 0.7:
+                    n += float(word["conf"])
+                    if float(word["conf"]) > 0.7:
                         print(colored(word["word"], "green"), end=" ")
-                    elif int(word["conf"]) > 0.7:
+                    elif float(word["conf"]) > 0.5:
                         print(colored(word["word"], "yellow"), end=" ")
                     else:
                         print(colored(word["word"], "red"), end=" ")
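The int-to-float change matters because vosk confidences are floats in [0, 1]: int() truncates any score below 1.0 to 0, so nothing cleared the 0.7 threshold and words were coloured red regardless (the old elif also repeated 0.7, leaving the yellow branch dead). A one-line illustration:

```python
# Why int() broke the thresholds: truncation sends any conf below 1.0 to 0.
conf = 0.83
print(int(conf) > 0.7)    # False, since int(0.83) == 0
print(float(conf) > 0.7)  # True
```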