Speech-to-Sticker
Thermal printer + GPT-3 + DALL·E 2 (or Midjourney, Stable Diffusion, etc.)
"Speech-to-sticker" is a fun application of the recent latent diffusion and language model craze. This little fluff box houses a battery, Bluetooth microphone and a Bluetooth thermal printer.
It prints a small sticker from voice commands; just remember to say "please".
A transcription is sent to GPT-3, a large language model, which extracts grammatically correct image descriptions and filters out extraneous words and background conversation. A series of "prompts" is then sent to OpenAI's DALL·E 2 (because it has a convenient API) to fetch images to print.
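To make that concrete, here is an illustrative before/after for the extraction step (the transcript and output below are made up, not real model output); the Lambda further down asks GPT-3 to hand the filtered requests back as a JSON array of objects so they are easy to parse:

// Hypothetical example of what the extraction step is asked to produce
const transcript = "umm can I please have a picture of a cat wearing a party hat... oh and a sailboat too";
const extracted = [
  { grammatically_correct_request: "a cat wearing a party hat" },
  { grammatically_correct_request: "a sailboat" },
];
// one DALL·E prompt per subject
const prompts = extracted.map((item) => item.grammatically_correct_request);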
Although this prototype currently requires three separate web services and a mobile device, it could be made to run offline by using a Raspberry Pi with a Coral USB Accelerator or the now-discontinued Intel® Neural Compute Stick and running a C++/TFLite port of OpenAI's Whisper speech-to-text model.
For now, latent diffusion models are not able to run on such constrained hardware, so a serverless function hosted on AWS Lambda is required:
I'm using an Android mobile device to send a POST request to a Node.js AWS Lambda function, which forwards the transcript to GPT-3, parses the response into DALL·E 2 API requests, composites the returned images into a single image, and uploads it to S3. The URL is then returned to the Android device, which prints the image over Bluetooth.
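As a rough sketch of that contract (the endpoint URL here is a placeholder, and the transcript field name simply matches what the Lambda below reads), the client side is just a POST with the transcript and a URL coming back:

// Sketch of the client side of the exchange (placeholder endpoint URL)
const res = await fetch("https://example.execute-api.us-east-1.amazonaws.com/print", {
  method: "POST",
  body: JSON.stringify({ transcript: "please print a picture of a hedgehog riding a bicycle" }),
});
const stickerUrl = await res.json(); // e.g. "https://speechtosticker.s3.amazonaws.com/a-hedgehog-riding-a-bicycle.jpg"
// the Android app downloads stickerUrl and sends the bitmap to the thermal printer over Bluetooth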
Start by creating a Node.js function configured with an API key from OpenAI.
import { Configuration, OpenAIApi } from "openai";
import { joinImages } from 'join-images';
import AWS from 'aws-sdk';
import fetch from "node-fetch";
import fs from 'fs-extra';
import { outputFile } from 'fs-extra/esm'
import sharp from 'sharp'

// Configure the OpenAI client from a Lambda environment variable
const configuration = new Configuration({
  apiKey: process.env.OPENAI_API_KEY,
});
const openai = new OpenAIApi(configuration);
// Upload a local file to S3 with public-read access
function putObjectToS3(bucket, key, data) {
  const blob = fs.readFileSync(data)
  var s3 = new AWS.S3();
  var params = {
    Bucket: bucket,
    Key: key,
    Body: blob,
    ACL: 'public-read'
  }
  // resolve with the S3 putObject response (or reject on error)
  return new Promise((resolve, reject) => {
    s3.putObject(params, function (err, data) {
      if (err) {
        console.log(err)
        reject(err)
      } else {
        console.log(data)
        resolve(data)
      }
    })
  })
}
// Fetch an image URL and write it to a local file path
const downloadImage = async (url, path) => {
  const response = await fetch(url);
  const blob = await response.blob();
  const arrayBuffer = await blob.arrayBuffer();
  const buffer = Buffer.from(arrayBuffer);
  return outputFile(path, buffer);
}
// Request one DALL·E 2 image per prompt and partition the settled results
const requestImages = async (prompt_array) => {
  let promises = prompt_array.reverse().map((prompt) => {
    return openai.createImage({
      prompt: `A beautiful hand drawn illustration of ${prompt}. black and white, minimal line drawing, no background`,
      n: 1,
      size: "512x512"
    })
  })
  let dalle_responses = await Promise.allSettled(promises);
  return dalle_responses.reduce((result, res) => {
    if (res.status === 'fulfilled') {
      result.successfullyFetchedImages.push(res.value)
    } else {
      result.failedImages.push(res.reason)
    }
    return result;
  }, {
    successfullyFetchedImages: [],
    failedImages: [],
  })
}
export const handler = async (event) => {
  sharp.cache(false)
  // Clear /tmp so files left over from a previous (warm) invocation are not reprinted
  try {
    await fs.emptyDir('/tmp')
  } catch (err) {
    console.error(err)
  }
  let body = JSON.parse(event.body);
  // Ask GPT-3 to filter the transcript down to a JSON array of image requests
  const gpt_response = await openai.createCompletion({
    model: "text-davinci-003",
    prompt: `the following is a speech-to-text transcript of us children requesting images to print. Filter our transcript for image requests, and rewrite them as one subject per image (grammatically correct).\n\ntranscript: I would love a beautiful picture of a boat on a river. ${body.transcript}\n\nrequests (array of json objects):\n[{\"grammatically_correct_request\": \"a boat on a river\"}, {\"grammatically_correct_request\":`,
    max_tokens: 512,
    temperature: 0.8,
  });
  // Re-attach the JSON prefix from the prompt, parse the completion, and pull out the prompt strings
  let prompt_array = JSON.parse(`[{"grammatically_correct_request":${gpt_response.data.choices[0].text.replaceAll("'", "\'")}`).map((item) => item.grammatically_correct_request)
  const { successfullyFetchedImages } = await requestImages(prompt_array)
  // Download each generated image into /tmp; the index keeps simultaneous downloads from colliding on the same timestamp
  await Promise.all(successfullyFetchedImages.map(async (image, i) => {
    let filename = `${Date.now().toString(36)}-${i}`
    return downloadImage(image.data.data[0].url, `/tmp/${filename}.jpg`)
  }))
  // Stack the downloaded images into one strip, then rotate 180 degrees so the print comes out the right way up
  await joinImages(fs.readdirSync('/tmp').map(f => `/tmp/${f}`)).then(async (img) => {
    await img.toFile('/tmp/out.jpg')
    await sharp('/tmp/out.jpg').rotate(180).toFile('/tmp/out_180.jpg')
  })
  // Upload the composite to S3 under a slug built from the prompts
  let s3name = `${prompt_array.join("-").replace(/[^a-zA-Z0-9]/g, '-')}.jpg`
  await putObjectToS3("speechtosticker", s3name, "/tmp/out_180.jpg")
  const response = {
    statusCode: 200,
    body: JSON.stringify(`https://speechtosticker.s3.amazonaws.com/${s3name}`)
  }
  return response;
};
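To exercise the function without the microphone or the printer in the loop, you can call the handler with a mock API Gateway proxy event. A minimal sketch, assuming the handler lives in index.mjs and that OPENAI_API_KEY plus AWS credentials are set in the environment:

// test.mjs: hypothetical local smoke test for the Lambda handler
import { handler } from "./index.mjs";

const event = {
  body: JSON.stringify({ transcript: "please print a picture of a hedgehog riding a bicycle" }),
};

const res = await handler(event);
console.log(res.statusCode, res.body); // 200 "https://speechtosticker.s3.amazonaws.com/..."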