master #2
|
@ -1,2 +1,3 @@
|
|||
.env
|
||||
image.png
|
||||
image.png
|
||||
images/
|
14
README.md
14
README.md
|
@ -1,14 +1,20 @@
|
|||
# Speech to Image
|
||||
|
||||
## Pi Setup
|
||||
|
||||
https://forums.raspberrypi.com/viewtopic.php?t=330358
|
||||
|
||||
https://learn.adafruit.com/adafruit-i2s-mems-microphone-breakout/raspberry-pi-wiring-test
|
||||
|
||||
https://github.com/alphacep/vosk-server
|
||||
|
||||
``` sudo apt install python3-pip git python3-pyaudio vlc```
|
||||
``` sudo pip3 install websockets sounddevice```
|
||||
```sudo apt install python3-pip git python3-pyaudio vlc```
|
||||
|
||||
```sudo pip3 install sounddevice aiohttp aiofiles python-vlc termcolor vosk websockets```
|
||||
|
||||
```sudo docker-compose up -d vosk```
|
||||
|
||||
## Image Server
|
||||
|
||||
https://huggingface.co/CompVis/stable-diffusion-v1-4
|
||||
|
||||
|
@ -23,4 +29,6 @@ distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
|
|||
```
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y nvidia-docker2
|
||||
```
|
||||
```
|
||||
|
||||
```sudo docker-compose up -d --build imageserver```
|
||||
|
|
|
@ -5,11 +5,19 @@ services:
|
|||
image: imageserver
|
||||
build: imageserver
|
||||
runtime: nvidia
|
||||
command: python3 /main.py
|
||||
ports:
|
||||
- 8000:8000
|
||||
env_file:
|
||||
- .env
|
||||
volumes:
|
||||
- ./images:/images
|
||||
- ./notebooks:/notebooks
|
||||
- ./models:/root/.cache/huggingface/diffusers/
|
||||
ports:
|
||||
- 8888:8888
|
||||
- 8000:8000
|
||||
|
||||
vosk:
|
||||
image: alphacep/kaldi-en
|
||||
ports:
|
||||
- 2700:2700
|
||||
- 2700:2700
|
||||
|
|
|
@ -1,8 +1,14 @@
|
|||
FROM nvidia/cuda:11.6.0-base-ubuntu20.04
|
||||
RUN apt-get update && apt-get install python3 python3-pip -y
|
||||
|
||||
RUN pip3 install --upgrade diffusers transformers scipy python-dotenv cuda-python && \
|
||||
RUN pip3 install --upgrade diffusers transformers scipy python-dotenv cuda-python fastapi uvicorn httplib2 && \
|
||||
pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117 && \
|
||||
pip install torch==1.11.0+cu115 torchvision==0.12.0+cu115 torchaudio==0.11.0+cu115 -f https://download.pytorch.org/whl/torch_stable.html
|
||||
|
||||
COPY main.py /main.py
|
||||
WORKDIR /app
|
||||
|
||||
COPY main.py /app/main.py
|
||||
VOLUME /root/.cache/huggingface/diffusers/
|
||||
|
||||
|
||||
CMD [ "uvicorn", "main:app", "--host", "0.0.0.0" ]
|
|
@ -1,19 +1,55 @@
|
|||
"""FastAPI service that turns a text prompt into a Stable Diffusion image.

GET /?text=word1+word2 generates an image for the prompt, archives a copy
(with the prompt embedded as PNG metadata) under /images, and returns the
PNG bytes in the HTTP response.
"""
import io
import os
import uuid
from os import getenv

import torch
from diffusers import StableDiffusionPipeline
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException, Response
from PIL.PngImagePlugin import PngInfo
from pydantic import BaseModel

# Pull TOKEN (and any other settings) from a .env file into the environment.
load_dotenv()

# get your token at https://huggingface.co/settings/tokens
pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    revision="fp16",
    torch_dtype=torch.float16,
    use_auth_token=getenv("TOKEN"),
)
pipe.to("cuda")


class Text(BaseModel):
    # Request body model for a prompt; kept for API compatibility even though
    # the current endpoint takes the prompt as a query parameter.
    text: str


app = FastAPI()


@app.get(
    "/",
    responses={
        200: {
            "content": {"image/png": {}}
        }
    },
    response_class=Response,
)
def root(text: str):
    """Generate an image for *text* ('+'-separated words) and return it as PNG.

    Raises:
        HTTPException(202): the pipeline raised RuntimeError — typically the
            GPU is busy / out of memory; the client should retry.
        HTTPException(504): any other generation failure.
    """
    prompt = text.replace('+', ' ')
    print(prompt)
    try:
        resp = pipe(prompt)
        print(resp)
        image = resp.images[0]
    except RuntimeError as err:
        raise HTTPException(status_code=202, detail="Busy") from err
    except Exception as err:  # was a bare except; keep the 504 but chain the cause
        raise HTTPException(status_code=504) from err

    # Archive the image with the prompt stored in its PNG metadata.
    metadata = PngInfo()
    metadata.add_text("text", prompt)
    image.save(f'/images/{str(uuid.uuid4())}.png', pnginfo=metadata)

    # Serialize to bytes for the HTTP response.
    buf = io.BytesIO()
    image.save(buf, format="PNG")
    return Response(content=buf.getvalue(), media_type="image/png")
|
||||
|
|
|
@ -0,0 +1,124 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import queue
|
||||
import sys
|
||||
import json
|
||||
import asyncio
|
||||
import sounddevice as sd
|
||||
from termcolor import colored
|
||||
import aiohttp
|
||||
import aiofiles
|
||||
import vlc
|
||||
from time import sleep
|
||||
|
||||
from vosk import Model, KaldiRecognizer
|
||||
|
||||
q = queue.Queue()
|
||||
|
||||
def int_or_str(text):
    """Coerce *text* to an int when it parses as one, else return it unchanged.

    Used as the argparse type for --device, which accepts either a numeric
    device ID or a device-name substring.
    """
    try:
        parsed = int(text)
    except ValueError:
        parsed = text
    return parsed
|
||||
|
||||
def callback(indata, frames, time, status):
    """Audio-stream callback: hand each captured block to the recognizer queue.

    Runs on sounddevice's capture thread, so it only copies the raw samples
    into the module-level queue ``q`` for the main loop to consume; any
    stream status flags are reported on stderr.
    """
    if status:
        sys.stderr.write(f"{status}\n")
    q.put(bytes(indata))
|
||||
|
||||
|
||||
async def main():
    """Stream microphone audio through Vosk and display generated images.

    Recognized words are colour-coded by confidence (green/yellow/red).
    Utterances longer than two words are sent to the image server; a
    returned PNG is saved as image.png and flashed on screen via VLC.
    Exits cleanly on Ctrl+C; any other error is reported via parser.exit.
    """
    # VLC player that re-displays image.png each time a new image arrives.
    vlc_instance = vlc.Instance()
    player = vlc_instance.media_player_new()
    media = vlc_instance.media_new("image.png")
    player.set_media(media)

    # First-pass parser handles only -l/--list-devices so it can exit
    # before the full argument parsing below.
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument(
        "-l", "--list-devices", action="store_true",
        help="show list of audio devices and exit")
    args, remaining = parser.parse_known_args()
    if args.list_devices:
        print(sd.query_devices())
        parser.exit(0)
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[parser])
    parser.add_argument(
        "-f", "--filename", type=str, metavar="FILENAME",
        help="audio file to store recording to")
    parser.add_argument(
        "-d", "--device", type=int_or_str,
        help="input device (numeric ID or substring)")
    parser.add_argument(
        "-r", "--samplerate", type=int, help="sampling rate")
    args = parser.parse_args(remaining)

    # NOTE(review): image-server address is hard-coded; consider promoting it
    # to a --server argument or environment variable.
    server = 'http://192.168.1.95:8000'

    try:
        if args.samplerate is None:
            device_info = sd.query_devices(args.device, "input")
            # soundfile expects an int, sounddevice provides a float:
            args.samplerate = int(device_info["default_samplerate"])

        model = Model(lang="en-us")

        if args.filename:
            dump_fn = open(args.filename, "wb")
        else:
            dump_fn = None

        with sd.RawInputStream(samplerate=args.samplerate, blocksize=8000,
                               device=args.device, dtype="int16",
                               channels=1, callback=callback):
            print("#" * 80)
            print("Press Ctrl+C to stop the recording")
            print("#" * 80)

            rec = KaldiRecognizer(model, args.samplerate)
            while True:
                data = q.get()
                if rec.AcceptWaveform(data):
                    # BUG FIX: Result() finalizes and resets the current
                    # utterance, so it must be called exactly once and the
                    # string reused (it was previously called twice: once
                    # for print, once for json.loads).
                    result = rec.Result()
                    print(result)
                    j = json.loads(result)
                    if "text" in j and "result" in j:
                        n = 0
                        # Colour each word by recognizer confidence.
                        for word in j["result"]:
                            n += float(word["conf"])
                            if float(word["conf"]) > 0.7:
                                print(colored(word["word"], "green"), end=" ")
                            elif float(word["conf"]) > 0.5:
                                print(colored(word["word"], "yellow"), end=" ")
                            else:
                                print(colored(word["word"], "red"), end=" ")
                        print(n / len(j["result"]))
                        print("Generating Image")
                        # Only bother the GPU for utterances of 3+ words.
                        if len(j["result"]) > 2:
                            async with aiohttp.ClientSession() as session:
                                url = f'{server}?text={j["text"].replace(" ", "+")}'
                                async with session.get(url) as resp:
                                    print(resp.status)
                                    if resp.status == 200:
                                        # Context manager guarantees the file
                                        # is closed even if the write fails.
                                        async with aiofiles.open('image.png', mode='wb') as f:
                                            await f.write(await resp.read())
                                        print("Image generated")
                                        player.stop()
                                        player.play()
                                        # Non-blocking pause keeps the event
                                        # loop responsive (was time.sleep).
                                        await asyncio.sleep(1)
                                        player.pause()
                # else:
                #     print(rec.PartialResult())
                if dump_fn is not None:
                    dump_fn.write(data)

    except KeyboardInterrupt:
        print("\nDone")
        parser.exit(0)
    except Exception as e:
        parser.exit(type(e).__name__ + ": " + str(e))


if __name__ == '__main__':
    asyncio.run(main())
|
|
@ -43,10 +43,10 @@ async def run_test():
|
|||
if "text" in j and "result" in j:
|
||||
n = 0
|
||||
for word in j["result"]:
|
||||
n += int(word["conf"])
|
||||
if int(word["conf"]) > 0.7:
|
||||
n += float(word["conf"])
|
||||
if float(word["conf"]) > 0.7:
|
||||
print(colored(word["word"], "green"), end=" ")
|
||||
elif int(word["conf"]) > 0.7:
|
||||
elif float(word["conf"]) > 0.5:
|
||||
print(colored(word["word"], "yellow"), end=" ")
|
||||
else:
|
||||
print(colored(word["word"], "red"), end=" ")
|
||||
|
|
Loading…
Reference in New Issue