diff --git a/.vscode/settings.json b/.vscode/settings.json index 0794c27..dde853f 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -5,6 +5,7 @@ "Geospatial", "Kbps", "Mbps", + "SSML", "Seeed", "Siri", "Twilio", diff --git a/6-consumer/lessons/1-speech-recognition/code-iot-hub/virtual-iot-device/smart-timer/app.py b/6-consumer/lessons/1-speech-recognition/code-iot-hub/virtual-iot-device/smart-timer/app.py index 59df663..ae111d1 100644 --- a/6-consumer/lessons/1-speech-recognition/code-iot-hub/virtual-iot-device/smart-timer/app.py +++ b/6-consumer/lessons/1-speech-recognition/code-iot-hub/virtual-iot-device/smart-timer/app.py @@ -14,11 +14,11 @@ print('Connecting') device_client.connect() print('Connected') -speech_config = SpeechConfig(subscription=api_key, - region=location, - speech_recognition_language=language) +recognizer_config = SpeechConfig(subscription=api_key, + region=location, + speech_recognition_language=language) -recognizer = SpeechRecognizer(speech_config=speech_config) +recognizer = SpeechRecognizer(speech_config=recognizer_config) def recognized(args): if len(args.result.text) > 0: diff --git a/6-consumer/lessons/1-speech-recognition/code-speech-to-text/virtual-iot-device/smart-timer/app.py b/6-consumer/lessons/1-speech-recognition/code-speech-to-text/virtual-iot-device/smart-timer/app.py index 1563265..355b9c2 100644 --- a/6-consumer/lessons/1-speech-recognition/code-speech-to-text/virtual-iot-device/smart-timer/app.py +++ b/6-consumer/lessons/1-speech-recognition/code-speech-to-text/virtual-iot-device/smart-timer/app.py @@ -5,11 +5,11 @@ api_key = '' location = '' language = '' -speech_config = SpeechConfig(subscription=api_key, - region=location, - speech_recognition_language=language) +recognizer_config = SpeechConfig(subscription=api_key, + region=location, + speech_recognition_language=language) -recognizer = SpeechRecognizer(speech_config=speech_config) +recognizer = SpeechRecognizer(speech_config=recognizer_config) def recognized(args): print(args.result.text) diff --git a/6-consumer/lessons/1-speech-recognition/virtual-device-speech-to-text.md b/6-consumer/lessons/1-speech-recognition/virtual-device-speech-to-text.md index 79618b5..02e29b8 100644 --- a/6-consumer/lessons/1-speech-recognition/virtual-device-speech-to-text.md +++ b/6-consumer/lessons/1-speech-recognition/virtual-device-speech-to-text.md @@ -45,9 +45,9 @@ On Windows, Linux, and macOS, the speech services Python SDK can be used to list location = '' language = '' - speech_config = SpeechConfig(subscription=api_key, - region=location, - speech_recognition_language=language) + recognizer_config = SpeechConfig(subscription=api_key, + region=location, + speech_recognition_language=language) ``` Replace `` with the API key for your speech service. Replace `` with the location you used when you created the speech service resource. @@ -59,7 +59,7 @@ On Windows, Linux, and macOS, the speech services Python SDK can be used to list 1. Add the following code to create a speech recognizer: ```python - recognizer = SpeechRecognizer(speech_config=speech_config) + recognizer = SpeechRecognizer(speech_config=recognizer_config) ``` 1. The speech recognizer runs on a background thread, listening for audio and converting any speech in it to text. You can get the text using a callback function - a function you define and pass to the recognizer. Every time speech is detected, the callback is called. 
Add the following code to define a callback that prints the text to the console, and pass this callback to the recognizer: diff --git a/6-consumer/lessons/2-language-understanding/README.md b/6-consumer/lessons/2-language-understanding/README.md index 811bd1d..29f99ae 100644 --- a/6-consumer/lessons/2-language-understanding/README.md +++ b/6-consumer/lessons/2-language-understanding/README.md @@ -347,7 +347,7 @@ Once published, the LUIS model can be called from code. In the last lesson you s if prediction_response.prediction.top_intent == 'set timer': numbers = prediction_response.prediction.entities['number'] time_units = prediction_response.prediction.entities['time unit'] - total_time = 0 + total_seconds = 0 ``` The `number` entities will be an array of numbers. For example, if you said *"Set a four minute 17 second timer."*, then the `number` array will contain 2 integers - 4 and 17. @@ -392,15 +392,15 @@ Once published, the LUIS model can be called from code. In the last lesson you s ```python if time_unit == 'minute': - total_time += number * 60 + total_seconds += number * 60 else: - total_time += number + total_seconds += number ``` 1. Finally, outside this loop through the entities, log the total time for the timer: ```python - logging.info(f'Timer required for {total_time} seconds') + logging.info(f'Timer required for {total_seconds} seconds') ``` 1. Run the function app and speak into your IoT device. You will see the total time for the timer in the function app output: diff --git a/6-consumer/lessons/2-language-understanding/assignment.md b/6-consumer/lessons/2-language-understanding/assignment.md index 69c4645..f3c5e30 100644 --- a/6-consumer/lessons/2-language-understanding/assignment.md +++ b/6-consumer/lessons/2-language-understanding/assignment.md @@ -2,7 +2,7 @@ ## Instructions -So far in this lesson you have trained a model to understand setting a timer. Another useful feature is cancelling a timer - maybe your bread is ready and can be taken out of the oven. +So far in this lesson you have trained a model to understand setting a timer. Another useful feature is cancelling a timer - maybe your bread is ready and can be taken out of the oven before the timer elapses. Add a new intent to your LUIS app to cancel the timer. It won't need any entities, but will need some example sentences. Handle this in your serverless code if it is the top intent, logging that the intent was recognized.
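As a rough sketch of what this could look like (assuming you name the new intent `cancel timer` - use whatever name you gave it in your LUIS app), the handling can mirror the existing `set timer` branch in the function code:

```python
# Hypothetical branch for a 'cancel timer' intent - the intent name is an
# assumption, and for this assignment it only needs to log that it was recognized
if prediction_response.prediction.top_intent == 'cancel timer':
    logging.info('Cancel timer intent recognized')
```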
diff --git a/6-consumer/lessons/2-language-understanding/code/functions/smart-timer-trigger/speech-trigger/__init__.py b/6-consumer/lessons/2-language-understanding/code/functions/smart-timer-trigger/speech-trigger/__init__.py index e660860..1b9f3ac 100644 --- a/6-consumer/lessons/2-language-understanding/code/functions/smart-timer-trigger/speech-trigger/__init__.py +++ b/6-consumer/lessons/2-language-understanding/code/functions/smart-timer-trigger/speech-trigger/__init__.py @@ -28,16 +28,16 @@ def main(events: List[func.EventHubEvent]): if prediction_response.prediction.top_intent == 'set timer': numbers = prediction_response.prediction.entities['number'] time_units = prediction_response.prediction.entities['time unit'] - total_time = 0 + total_seconds = 0 for i in range(0, len(numbers)): number = numbers[i] time_unit = time_units[i][0] if time_unit == 'minute': - total_time += number * 60 + total_seconds += number * 60 else: - total_time += number + total_seconds += number - logging.info(f'Timer required for {total_time} seconds') + logging.info(f'Timer required for {total_seconds} seconds') diff --git a/6-consumer/lessons/3-spoken-feedback/README.md b/6-consumer/lessons/3-spoken-feedback/README.md index aac7471..9fecc41 100644 --- a/6-consumer/lessons/3-spoken-feedback/README.md +++ b/6-consumer/lessons/3-spoken-feedback/README.md @@ -26,6 +26,50 @@ In this lesson we'll cover: ## Text to speech +Text to speech, as the name suggests, is the process of converting text into audio that contains the text as spoken words. The basic principle is to break down the words in the text into their constituent sounds (known as phonemes), and stitch together audio for those sounds, either using pre-recorded audio or using audio generated by AI models. + +![The three stages of typical text to speech systems](../../../images/tts-overview.png) + +Text to speech systems typically have 3 stages: + +* Text analysis +* Linguistic analysis +* Wave-form generation + +### Text analysis + +Text analysis involves taking the text provided, and converting it into words that can be used to generate speech. For example, if you convert "Hello world", there is no text analysis needed - the two words can be converted to speech. If you have "1234" however, then this might need to be converted either into the words "One thousand, two hundred thirty four" or "One, two, three, four" depending on the context. For "I have 1234 apples", then it would be "One thousand, two hundred thirty four", but for "The child counted 1234" then it would be "One, two, three, four". + +The words created vary not only with the language, but also with the locale of that language. For example, in American English, 120 would be "One hundred twenty", in British English it would be "One hundred and twenty", with the use of "and" after the hundreds. + +✅ Some other examples that require text analysis include "in" as a short form of inch, and "st" as a short form of saint and street. Can you think of other examples in your language of words that are ambiguous without context? + +Once the words have been defined, they are sent for linguistic analysis. + +### Linguistic analysis + +Linguistic analysis breaks the words down into phonemes. Phonemes are based not just on the letters used, but also on the other letters in the word. For example, in English the 'a' sound in 'car' and 'care' is different.
The English language has 44 different phonemes for the 26 letters in the alphabet, some shared by different letters, such as the same phoneme used at the start of 'circle' and 'serpent'. + +✅ Do some research: What are the phonemes for your language? + +Once the words have been converted to phonemes, these phonemes need additional data to support intonation, adjusting the tone or duration depending on the context. One example is that in English a pitch increase can be used to convert a sentence into a question - a raised pitch on the last word implies a question. + +For example - the sentence "You have an apple" is a statement saying that you have an apple. If the pitch goes up at the end, increasing for the word apple, it becomes the question "You have an apple?", asking if you have an apple. The linguistic analysis needs to use the question mark at the end to decide to increase pitch. + +Once the phonemes have been generated, they can be sent for wave-form generation to produce the audio output. + +### Wave-form generation + +The first electronic text to speech systems used single audio recordings for each phoneme, leading to very monotonous, robotic-sounding voices. The linguistic analysis would produce phonemes, which would then be loaded from a database of sounds and stitched together to make the audio. + +✅ Do some research: Find some audio recordings from early speech synthesis systems. Compare them to modern speech synthesis, such as that used in smart assistants. + +More modern wave-form generation uses ML models built using deep learning (very large neural networks that act in a similar way to neurons in the brain) to produce more natural sounding voices that can be indistinguishable from humans. + +> 💁 Some of these ML models can be re-trained using transfer learning to sound like real people. This means using voice as a security system, something banks are increasingly trying to do, is no longer a good idea as anyone with a recording of a few minutes of your voice can impersonate you. + +These large ML models are being trained to combine all three steps into end-to-end speech synthesizers. + ## Set the timer The timer can be set by sending a command from the serverless code, instructing the IoT device to set the timer. This command will contain the time in seconds till the timer needs to go off. @@ -38,11 +82,11 @@ The timer can be set by sending a command from the serverless code, instructing You will need to set up the connection string for the IoT Hub with the service policy (*NOT* the device) in your `local.settings.json` file and add the `azure-iot-hub` pip package to your `requirements.txt` file. The device ID can be extracted from the event. -1. The direct method you send needs to be called `set-timer`, and will need to send the length of the timer as a JSON property called `time`. Use the following code to build the `CloudToDeviceMethod` using the `total_time` calculated from the data extracted by LUIS: +1. The direct method you send needs to be called `set-timer`, and will need to send the length of the timer as a JSON property called `seconds`.
Use the following code to build the `CloudToDeviceMethod` using the `total_seconds` calculated from the data extracted by LUIS: ```python payload = { - 'time': total_time + 'seconds': total_seconds } direct_method = CloudToDeviceMethod(method_name='set-timer', payload=json.dumps(payload)) ``` @@ -60,11 +104,23 @@ The timer can be set by sending a command from the serverless code, instructing * [Arduino - Wio Terminal](wio-terminal-set-timer.md) * [Single-board computer - Raspberry Pi/Virtual IoT device](single-board-computer-set-timer.md) -> 💁 You can find this code in the [code-command/wio-terminal](code-command/wio-terminal), [code-command/virtual-device](code-command/virtual-device), or [code-command/pi](code-command/pi) folder. - ## Convert text to speech -The same speech service you used to convert speech to text can be used to convert text back into speech, and this can be played through a microphone on your IoT device. +The same speech service you used to convert speech to text can be used to convert text back into speech, and this can be played through a speaker on your IoT device. The text to convert is sent to the speech service, along with the type of audio required (such as the sample rate), and binary data containing the audio is returned. + +When you send this request, you send it using *Speech Synthesis Markup Language* (SSML), an XML-based markup language for speech synthesis applications. This defines not only the text to be converted, but the language of the text, the voice to use, and can even be used to define speed, volume, and pitch for some or all of the words in the text. + +For example, this SSML defines a request to convert the text "Your 3 minute 5 second timer has been set" to speech using a British English voice called `en-GB-MiaNeural`: + +```xml +<speak version='1.0' xml:lang='en-GB'> + <voice xml:lang='en-GB' name='en-GB-MiaNeural'> + Your 3 minute 5 second timer has been set + </voice> +</speak> +``` + +> 💁 Most text to speech systems have multiple voices for different languages, with relevant accents such as a British English voice with an English accent and a New Zealand English voice with a New Zealand accent. ### Task - convert text to speech @@ -78,12 +134,17 @@ Work through the relevant guide to convert text to speech using your IoT device: ## 🚀 Challenge +SSML has ways to change how words are spoken, such as adding emphasis to certain words, adding pauses, or changing pitch. Try some of these out, sending different SSML from your IoT device and comparing the output. You can read more about SSML, including how to change the way words are spoken, in the [Speech Synthesis Markup Language (SSML) Version 1.1 specification from the World Wide Web consortium](https://www.w3.org/TR/speech-synthesis11/).
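As a starting point for experimenting, you could build richer SSML by hand and send it through the same text to speech call your device code already uses. This is only a sketch: the `language` and `voice` values are assumed to come from your device code, and the `emphasis`, `break` and `prosody` elements are standard SSML whose exact effect can vary by voice.

```python
# Hand-built SSML for the challenge - emphasis, a pause, and a rate/pitch change.
# 'language' and 'voice' are assumed to be defined elsewhere in your device code.
ssml =  f'<speak version=\'1.0\' xml:lang=\'{language}\'>'
ssml += f'<voice xml:lang=\'{language}\' name=\'{voice}\'>'
ssml += 'Your timer has <emphasis level="strong">finished</emphasis>.'
ssml += '<break time="500ms"/>'
ssml += '<prosody rate="slow" pitch="+5%">Enjoy your bread!</prosody>'
ssml += '</voice>'
ssml += '</speak>'
```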
+ ## Post-lecture quiz [Post-lecture quiz](https://brave-island-0b7c7f50f.azurestaticapps.net/quiz/46) ## Review & Self Study +* Read more on speech synthesis on the [Speech synthesis page on Wikipedia](https://wikipedia.org/wiki/Speech_synthesis) +* Read more on ways criminals are using speech synthesis to steal on the [Fake voices 'help cyber crooks steal cash' story on BBC news](https://www.bbc.com/news/technology-48908736) + ## Assignment -[](assignment.md) +[Cancel the timer](assignment.md) diff --git a/6-consumer/lessons/3-spoken-feedback/assignment.md b/6-consumer/lessons/3-spoken-feedback/assignment.md index da157d5..efaad57 100644 --- a/6-consumer/lessons/3-spoken-feedback/assignment.md +++ b/6-consumer/lessons/3-spoken-feedback/assignment.md @@ -1,9 +1,12 @@ -# +# Cancel the timer ## Instructions +In the assignment for the last lesson, you added a cancel timer intent to LUIS. For this assignment you need to handle this intent in the serverless code, send a command to the IoT device, then cancel the timer. + ## Rubric | Criteria | Exemplary | Adequate | Needs Improvement | | -------- | --------- | -------- | ----------------- | -| | | | | +| Handle the intent in serverless code and send a command | Was able to handle the intent and send a command to the device | Was able to handle the intent but was unable to send the command to the device | Was unable to handle the intent | +| Cancel the timer on the device | Was able to receive the command and cancel the timer | Was able to receive the command but not cancel the timer | Was unable to receive the command | diff --git a/6-consumer/lessons/3-spoken-feedback/code-command/functions/smart-timer-trigger/host.json b/6-consumer/lessons/3-spoken-feedback/code-command/functions/smart-timer-trigger/host.json new file mode 100644 index 0000000..291065f --- /dev/null +++ b/6-consumer/lessons/3-spoken-feedback/code-command/functions/smart-timer-trigger/host.json @@ -0,0 +1,15 @@ +{ + "version": "2.0", + "logging": { + "applicationInsights": { + "samplingSettings": { + "isEnabled": true, + "excludedTypes": "Request" + } + } + }, + "extensionBundle": { + "id": "Microsoft.Azure.Functions.ExtensionBundle", + "version": "[2.*, 3.0.0)" + } +} \ No newline at end of file diff --git a/6-consumer/lessons/3-spoken-feedback/code-command/functions/smart-timer-trigger/local.settings.json b/6-consumer/lessons/3-spoken-feedback/code-command/functions/smart-timer-trigger/local.settings.json new file mode 100644 index 0000000..8b5b956 --- /dev/null +++ b/6-consumer/lessons/3-spoken-feedback/code-command/functions/smart-timer-trigger/local.settings.json @@ -0,0 +1,12 @@ +{ + "IsEncrypted": false, + "Values": { + "FUNCTIONS_WORKER_RUNTIME": "python", + "AzureWebJobsStorage": "UseDevelopmentStorage=true", + "IOT_HUB_CONNECTION_STRING": "", + "LUIS_KEY": "", + "LUIS_ENDPOINT_URL": "", + "LUIS_APP_ID": "", + "REGISTRY_MANAGER_CONNECTION_STRING": "" + } +} \ No newline at end of file diff --git a/6-consumer/lessons/3-spoken-feedback/code-command/functions/smart-timer-trigger/requirements.txt b/6-consumer/lessons/3-spoken-feedback/code-command/functions/smart-timer-trigger/requirements.txt new file mode 100644 index 0000000..d0405a3 --- /dev/null +++ b/6-consumer/lessons/3-spoken-feedback/code-command/functions/smart-timer-trigger/requirements.txt @@ -0,0 +1,4 @@ +# Do not include azure-functions-worker as it may conflict with the Azure Functions platform + +azure-functions +azure-cognitiveservices-language-luis \ No newline at end of file diff --git 
a/6-consumer/lessons/3-spoken-feedback/code-command/functions/smart-timer-trigger/speech-trigger/__init__.py b/6-consumer/lessons/3-spoken-feedback/code-command/functions/smart-timer-trigger/speech-trigger/__init__.py new file mode 100644 index 0000000..be8e5ee --- /dev/null +++ b/6-consumer/lessons/3-spoken-feedback/code-command/functions/smart-timer-trigger/speech-trigger/__init__.py @@ -0,0 +1,60 @@ +from typing import List +import logging + +import azure.functions as func + +import json +import os +from azure.cognitiveservices.language.luis.runtime import LUISRuntimeClient +from msrest.authentication import CognitiveServicesCredentials + +from azure.iot.hub import IoTHubRegistryManager +from azure.iot.hub.models import CloudToDeviceMethod + +def main(events: List[func.EventHubEvent]): + luis_key = os.environ['LUIS_KEY'] + endpoint_url = os.environ['LUIS_ENDPOINT_URL'] + app_id = os.environ['LUIS_APP_ID'] + registry_manager_connection_string = os.environ['REGISTRY_MANAGER_CONNECTION_STRING'] + + credentials = CognitiveServicesCredentials(luis_key) + client = LUISRuntimeClient(endpoint=endpoint_url, credentials=credentials) + + for event in events: + logging.info('Python EventHub trigger processed an event: %s', + event.get_body().decode('utf-8')) + + device_id = event.iothub_metadata['connection-device-id'] + + event_body = json.loads(event.get_body().decode('utf-8')) + prediction_request = { 'query' : event_body['speech'] } + + prediction_response = client.prediction.get_slot_prediction(app_id, 'Staging', prediction_request) + + if prediction_response.prediction.top_intent == 'set timer': + numbers = prediction_response.prediction.entities['number'] + time_units = prediction_response.prediction.entities['time unit'] + total_seconds = 0 + + for i in range(0, len(numbers)): + number = numbers[i] + time_unit = time_units[i][0] + + if time_unit == 'minute': + total_seconds += number * 60 + else: + total_seconds += number + + logging.info(f'Timer required for {total_seconds} seconds') + + payload = { + 'seconds': total_seconds + } + direct_method = CloudToDeviceMethod(method_name='set-timer', payload=json.dumps(payload)) + + registry_manager_connection_string = os.environ['REGISTRY_MANAGER_CONNECTION_STRING'] + registry_manager = IoTHubRegistryManager(registry_manager_connection_string) + + registry_manager.invoke_device_method(device_id, direct_method) + + diff --git a/6-consumer/lessons/3-spoken-feedback/code-command/functions/smart-timer-trigger/speech-trigger/function.json b/6-consumer/lessons/3-spoken-feedback/code-command/functions/smart-timer-trigger/speech-trigger/function.json new file mode 100644 index 0000000..0117bdf --- /dev/null +++ b/6-consumer/lessons/3-spoken-feedback/code-command/functions/smart-timer-trigger/speech-trigger/function.json @@ -0,0 +1,15 @@ +{ + "scriptFile": "__init__.py", + "bindings": [ + { + "type": "eventHubTrigger", + "name": "events", + "direction": "in", + "eventHubName": "samples-workitems", + "connection": "IOT_HUB_CONNECTION_STRING", + "cardinality": "many", + "consumerGroup": "$Default", + "dataType": "binary" + } + ] +} \ No newline at end of file diff --git a/6-consumer/lessons/3-spoken-feedback/code-spoken-response/pi/smart-timer/app.py b/6-consumer/lessons/3-spoken-feedback/code-spoken-response/pi/smart-timer/app.py new file mode 100644 index 0000000..40bce46 --- /dev/null +++ b/6-consumer/lessons/3-spoken-feedback/code-spoken-response/pi/smart-timer/app.py @@ -0,0 +1,184 @@ +import io +import json +import pyaudio +import requests +import time 
+import wave +import threading + +from azure.iot.device import IoTHubDeviceClient, Message, MethodResponse + +from grove.factory import Factory +button = Factory.getButton('GPIO-HIGH', 5) + +audio = pyaudio.PyAudio() +microphone_card_number = 1 +speaker_card_number = 1 +rate = 16000 + +def capture_audio(): + stream = audio.open(format = pyaudio.paInt16, + rate = rate, + channels = 1, + input_device_index = microphone_card_number, + input = True, + frames_per_buffer = 4096) + + frames = [] + + while button.is_pressed(): + frames.append(stream.read(4096)) + + stream.stop_stream() + stream.close() + + wav_buffer = io.BytesIO() + with wave.open(wav_buffer, 'wb') as wavefile: + wavefile.setnchannels(1) + wavefile.setsampwidth(audio.get_sample_size(pyaudio.paInt16)) + wavefile.setframerate(rate) + wavefile.writeframes(b''.join(frames)) + wav_buffer.seek(0) + + return wav_buffer + +api_key = '' +location = '' +language = '' +connection_string = '' + +device_client = IoTHubDeviceClient.create_from_connection_string(connection_string) + +print('Connecting') +device_client.connect() +print('Connected') + +def get_access_token(): + headers = { + 'Ocp-Apim-Subscription-Key': api_key + } + + token_endpoint = f'https://{location}.api.cognitive.microsoft.com/sts/v1.0/issuetoken' + response = requests.post(token_endpoint, headers=headers) + return str(response.text) + +def convert_speech_to_text(buffer): + url = f'https://{location}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1' + + headers = { + 'Authorization': 'Bearer ' + get_access_token(), + 'Content-Type': f'audio/wav; codecs=audio/pcm; samplerate={rate}', + 'Accept': 'application/json;text/xml' + } + + params = { + 'language': language + } + + response = requests.post(url, headers=headers, params=params, data=buffer) + response_json = json.loads(response.text) + + if response_json['RecognitionStatus'] == 'Success': + return response_json['DisplayText'] + else: + return '' + +def get_voice(): + url = f'https://{location}.tts.speech.microsoft.com/cognitiveservices/voices/list' + + headers = { + 'Authorization': 'Bearer ' + get_access_token() + } + + response = requests.get(url, headers=headers) + voices_json = json.loads(response.text) + + first_voice = next(x for x in voices_json if x['Locale'].lower() == language.lower()) + return first_voice['ShortName'] + +voice = get_voice() +print(f"Using voice {voice}") + +playback_format = 'riff-48khz-16bit-mono-pcm' + +def get_speech(text): + url = f'https://{location}.tts.speech.microsoft.com/cognitiveservices/v1' + + headers = { + 'Authorization': 'Bearer ' + get_access_token(), + 'Content-Type': 'application/ssml+xml', + 'X-Microsoft-OutputFormat': playback_format + } + + ssml = f'' + ssml += f'' + ssml += text + ssml += '' + ssml += '' + + response = requests.post(url, headers=headers, data=ssml.encode('utf-8')) + return io.BytesIO(response.content) + +def play_speech(speech): + with wave.open(speech, 'rb') as wave_file: + stream = audio.open(format=audio.get_format_from_width(wave_file.getsampwidth()), + channels=wave_file.getnchannels(), + rate=wave_file.getframerate(), + output_device_index=speaker_card_number, + output=True) + + data = wave_file.readframes(4096) + + while len(data) > 0: + stream.write(data) + data = wave_file.readframes(4096) + + stream.stop_stream() + stream.close() + +def say(text): + speech = get_speech(text) + play_speech(speech) + +def announce_timer(minutes, seconds): + announcement = 'Times up on your ' + if minutes > 0: + announcement += 
f'{minutes} minute' + if seconds > 0: + announcement += f'{seconds} second' + announcement += ' timer.' + say(announcement) + +def create_timer(total_seconds): + minutes, seconds = divmod(total_seconds, 60) + threading.Timer(total_seconds, announce_timer, args=[minutes, seconds]).start() + announcement = '' + if minutes > 0: + announcement += f'{minutes} minute' + if seconds > 0: + announcement += f'{seconds} second' + announcement += ' timer started.' + say(announcement) + +def handle_method_request(request): + if request.name == 'set-timer': + payload = json.loads(request.payload) + seconds = payload['seconds'] + if seconds > 0: + create_timer(payload['seconds']) + + method_response = MethodResponse.create_from_method_request(request, 200) + device_client.send_method_response(method_response) + +device_client.on_method_request_received = handle_method_request + +while True: + while not button.is_pressed(): + time.sleep(.1) + + buffer = capture_audio() + text = convert_speech_to_text(buffer) + if len(text) > 0: + print(text) + message = Message(json.dumps({ 'speech': text })) + device_client.send_message(message) \ No newline at end of file diff --git a/6-consumer/lessons/3-spoken-feedback/code-spoken-response/virtual-iot-device/smart-timer/app.py b/6-consumer/lessons/3-spoken-feedback/code-spoken-response/virtual-iot-device/smart-timer/app.py new file mode 100644 index 0000000..cd1a8fe --- /dev/null +++ b/6-consumer/lessons/3-spoken-feedback/code-spoken-response/virtual-iot-device/smart-timer/app.py @@ -0,0 +1,86 @@ +import json +import threading +import time +from azure.cognitiveservices.speech import SpeechConfig, SpeechRecognizer, SpeechSynthesizer +from azure.iot.device import IoTHubDeviceClient, Message, MethodResponse + +api_key = '' +location = '' +language = '' +connection_string = '' + +device_client = IoTHubDeviceClient.create_from_connection_string(connection_string) + +print('Connecting') +device_client.connect() +print('Connected') + +recognizer_config = SpeechConfig(subscription=api_key, + region=location, + speech_recognition_language=language) + +recognizer = SpeechRecognizer(speech_config=recognizer_config) + +def recognized(args): + if len(args.result.text) > 0: + message = Message(json.dumps({ 'speech': args.result.text })) + device_client.send_message(message) + +recognizer.recognized.connect(recognized) + +recognizer.start_continuous_recognition() + +speech_config = SpeechConfig(subscription=api_key, + region=location) +speech_config.speech_synthesis_language = language +speech_synthesizer = SpeechSynthesizer(speech_config=speech_config) + +voices = speech_synthesizer.get_voices_async().get().voices +first_voice = next(x for x in voices if x.locale.lower() == language.lower()) +speech_config.speech_synthesis_voice_name = first_voice.short_name + +def say(text): + ssml = f'' + ssml += f'' + ssml += text + ssml += '' + ssml += '' + + recognizer.stop_continuous_recognition() + speech_synthesizer.speak_ssml(ssml) + recognizer.start_continuous_recognition() + +def announce_timer(minutes, seconds): + announcement = 'Times up on your ' + if minutes > 0: + announcement += f'{minutes} minute' + if seconds > 0: + announcement += f'{seconds} second' + announcement += ' timer.' 
+ say(announcement) + +def create_timer(total_seconds): + minutes, seconds = divmod(total_seconds, 60) + threading.Timer(total_seconds, announce_timer, args=[minutes, seconds]).start() + announcement = '' + if minutes > 0: + announcement += f'{minutes} minute' + if seconds > 0: + announcement += f'{seconds} second' + announcement += ' timer started.' + say(announcement) + +def handle_method_request(request): + if request.name == 'set-timer': + payload = json.loads(request.payload) + seconds = payload['seconds'] + if seconds > 0: + create_timer(payload['seconds']) + + method_response = MethodResponse.create_from_method_request(request, 200) + device_client.send_method_response(method_response) + +device_client.on_method_request_received = handle_method_request + +while True: + time.sleep(1) \ No newline at end of file diff --git a/6-consumer/lessons/3-spoken-feedback/code-timer/pi/smart-timer/app.py b/6-consumer/lessons/3-spoken-feedback/code-timer/pi/smart-timer/app.py new file mode 100644 index 0000000..1a8a622 --- /dev/null +++ b/6-consumer/lessons/3-spoken-feedback/code-timer/pi/smart-timer/app.py @@ -0,0 +1,130 @@ +import io +import json +import pyaudio +import requests +import time +import wave +import threading + +from azure.iot.device import IoTHubDeviceClient, Message, MethodResponse + +from grove.factory import Factory +button = Factory.getButton('GPIO-HIGH', 5) + +audio = pyaudio.PyAudio() +microphone_card_number = 1 +speaker_card_number = 1 +rate = 16000 + +def capture_audio(): + stream = audio.open(format = pyaudio.paInt16, + rate = rate, + channels = 1, + input_device_index = microphone_card_number, + input = True, + frames_per_buffer = 4096) + + frames = [] + + while button.is_pressed(): + frames.append(stream.read(4096)) + + stream.stop_stream() + stream.close() + + wav_buffer = io.BytesIO() + with wave.open(wav_buffer, 'wb') as wavefile: + wavefile.setnchannels(1) + wavefile.setsampwidth(audio.get_sample_size(pyaudio.paInt16)) + wavefile.setframerate(rate) + wavefile.writeframes(b''.join(frames)) + wav_buffer.seek(0) + + return wav_buffer + +api_key = '' +location = '' +language = '' +connection_string = '' + +device_client = IoTHubDeviceClient.create_from_connection_string(connection_string) + +print('Connecting') +device_client.connect() +print('Connected') + +def get_access_token(): + headers = { + 'Ocp-Apim-Subscription-Key': api_key + } + + token_endpoint = f'https://{location}.api.cognitive.microsoft.com/sts/v1.0/issuetoken' + response = requests.post(token_endpoint, headers=headers) + return str(response.text) + +def convert_speech_to_text(buffer): + url = f'https://{location}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1' + + headers = { + 'Authorization': 'Bearer ' + get_access_token(), + 'Content-Type': f'audio/wav; codecs=audio/pcm; samplerate={rate}', + 'Accept': 'application/json;text/xml' + } + + params = { + 'language': language + } + + response = requests.post(url, headers=headers, params=params, data=buffer) + response_json = json.loads(response.text) + + if response_json['RecognitionStatus'] == 'Success': + return response_json['DisplayText'] + else: + return '' + +def say(text): + print(text) + +def announce_timer(minutes, seconds): + announcement = 'Times up on your ' + if minutes > 0: + announcement += f'{minutes} minute' + if seconds > 0: + announcement += f'{seconds} second' + announcement += ' timer.' 
+ say(announcement) + +def create_timer(total_seconds): + minutes, seconds = divmod(total_seconds, 60) + threading.Timer(total_seconds, announce_timer, args=[minutes, seconds]).start() + announcement = '' + if minutes > 0: + announcement += f'{minutes} minute' + if seconds > 0: + announcement += f'{seconds} second' + announcement += ' timer started.' + say(announcement) + +def handle_method_request(request): + if request.name == 'set-timer': + payload = json.loads(request.payload) + seconds = payload['seconds'] + if seconds > 0: + create_timer(payload['seconds']) + + method_response = MethodResponse.create_from_method_request(request, 200) + device_client.send_method_response(method_response) + +device_client.on_method_request_received = handle_method_request + +while True: + while not button.is_pressed(): + time.sleep(.1) + + buffer = capture_audio() + text = convert_speech_to_text(buffer) + if len(text) > 0: + print(text) + message = Message(json.dumps({ 'speech': text })) + device_client.send_message(message) \ No newline at end of file diff --git a/6-consumer/lessons/3-spoken-feedback/code-timer/virtual-iot-device/smart-timer/app.py b/6-consumer/lessons/3-spoken-feedback/code-timer/virtual-iot-device/smart-timer/app.py new file mode 100644 index 0000000..f6f8ed0 --- /dev/null +++ b/6-consumer/lessons/3-spoken-feedback/code-timer/virtual-iot-device/smart-timer/app.py @@ -0,0 +1,69 @@ +import json +import threading +import time +from azure.cognitiveservices.speech import SpeechConfig, SpeechRecognizer +from azure.iot.device import IoTHubDeviceClient, Message, MethodResponse + +api_key = '' +location = '' +language = '' +connection_string = '' + +device_client = IoTHubDeviceClient.create_from_connection_string(connection_string) + +print('Connecting') +device_client.connect() +print('Connected') + +recognizer_config = SpeechConfig(subscription=api_key, + region=location, + speech_recognition_language=language) + +recognizer = SpeechRecognizer(speech_config=recognizer_config) + +def recognized(args): + if len(args.result.text) > 0: + message = Message(json.dumps({ 'speech': args.result.text })) + device_client.send_message(message) + +recognizer.recognized.connect(recognized) + +recognizer.start_continuous_recognition() + +def say(text): + print(text) + +def announce_timer(minutes, seconds): + announcement = 'Times up on your ' + if minutes > 0: + announcement += f'{minutes} minute' + if seconds > 0: + announcement += f'{seconds} second' + announcement += ' timer.' + say(announcement) + +def create_timer(total_seconds): + minutes, seconds = divmod(total_seconds, 60) + threading.Timer(total_seconds, announce_timer, args=[minutes, seconds]).start() + announcement = '' + if minutes > 0: + announcement += f'{minutes} minute' + if seconds > 0: + announcement += f'{seconds} second' + announcement += ' timer started.' 
+ say(announcement) + +def handle_method_request(request): + if request.name == 'set-timer': + payload = json.loads(request.payload) + seconds = payload['seconds'] + if seconds > 0: + create_timer(payload['seconds']) + + method_response = MethodResponse.create_from_method_request(request, 200) + device_client.send_method_response(method_response) + +device_client.on_method_request_received = handle_method_request + +while True: + time.sleep(1) \ No newline at end of file diff --git a/6-consumer/lessons/3-spoken-feedback/pi-text-to-speech.md b/6-consumer/lessons/3-spoken-feedback/pi-text-to-speech.md new file mode 100644 index 0000000..2c961e3 --- /dev/null +++ b/6-consumer/lessons/3-spoken-feedback/pi-text-to-speech.md @@ -0,0 +1,140 @@ +# Text to speech - Raspberry Pi + +In this part of the lesson, you will write code to convert text to speech using the speech service. + +## Convert text to speech using the speech service + +The text can be sent to the speech service using the REST API to get speech as an audio file that can be played back on your IoT device. When requesting speech, you need to provide the voice to use as speech can be generated using a variety of different voices. + +Each language supports a range of different voices, and you can make a REST request against the speech service to get the list of supported voices for each language. + +### Task - get a voice + +1. Add the following code above the `say` function to request the list of voices for a language: + + ```python + def get_voice(): + url = f'https://{location}.tts.speech.microsoft.com/cognitiveservices/voices/list' + + headers = { + 'Authorization': 'Bearer ' + get_access_token() + } + + response = requests.get(url, headers=headers) + voices_json = json.loads(response.text) + + first_voice = next(x for x in voices_json if x['Locale'].lower() == language.lower() and x['VoiceType'] == 'Neural') + return first_voice['ShortName'] + + voice = get_voice() + print(f"Using voice {voice}") + ``` + + This code defines a function called `get_voice` that uses the speech service to get a list of voices. It then finds the first voice that matches the language that is being used. + + This function is then called to store the first voice, and the voice name is printed to the console. This voice can be requested once and the value used for every call to convert text to speech. + + > 💁 You can get the full list of supported voices from the [Language and voice support documentation on Microsoft Docs](https://docs.microsoft.com/azure/cognitive-services/speech-service/language-support?WT.mc_id=academic-17441-jabenn#text-to-speech). If you want to use a specific voice, then you can remove this function and hard code the voice to the voice name from this documentation. For example: + > + > ```python + > voice = 'hi-IN-SwaraNeural' + > ``` + +### Task - convert text to speech + +1. Below this, define a constant for the audio format to be retrieved from the speech services. When you request audio, you can do it in a range of different formats. + + ```python + playback_format = 'riff-48khz-16bit-mono-pcm' + ``` + + The format you can use depends on your hardware. If you get `Invalid sample rate` errors when playing the audio then change this to another value. You can find the list of supported values in the [Text to speech REST API documentation on Microsoft Docs](https://docs.microsoft.com/azure/cognitive-services/speech-service/rest-text-to-speech?WT.mc_id=academic-17441-jabenn#audio-outputs). 
You will need to use `riff` format audio, and the values to try are `riff-16khz-16bit-mono-pcm`, `riff-24khz-16bit-mono-pcm` and `riff-48khz-16bit-mono-pcm`. + +1. Below this, declare a function called `get_speech` that will convert the text to speech using the speech service REST API: + + ```python + def get_speech(text): + ``` + +1. In the `get_speech` function, define the URL to call and the headers to pass: + + ```python + url = f'https://{location}.tts.speech.microsoft.com/cognitiveservices/v1' + + headers = { + 'Authorization': 'Bearer ' + get_access_token(), + 'Content-Type': 'application/ssml+xml', + 'X-Microsoft-OutputFormat': playback_format + } + ``` + + This sets the headers to use a generated access token, sets the content to SSML, and defines the audio format needed. + +1. Below this, define the SSML to send to the REST API: + + ```python + ssml = f'<speak version=\'1.0\' xml:lang=\'{language}\'>' + ssml += f'<voice xml:lang=\'{language}\' name=\'{voice}\'>' + ssml += text + ssml += '</voice>' + ssml += '</speak>' + ``` + + This SSML sets the language and the voice to use, along with the text to convert. + +1. Finally, add code in this function to make the REST request and return the binary audio data: + + ```python + response = requests.post(url, headers=headers, data=ssml.encode('utf-8')) + return io.BytesIO(response.content) + ``` + +### Task - play the audio + +1. Below the `get_speech` function, define a new function to play the audio returned by the REST API call: + + ```python + def play_speech(speech): + ``` + +1. The `speech` passed to this function will be the binary audio data returned from the REST API. Use the following code to open this as a wave file and pass it to PyAudio to play the audio: + + ```python + def play_speech(speech): + with wave.open(speech, 'rb') as wave_file: + stream = audio.open(format=audio.get_format_from_width(wave_file.getsampwidth()), + channels=wave_file.getnchannels(), + rate=wave_file.getframerate(), + output_device_index=speaker_card_number, + output=True) + + data = wave_file.readframes(4096) + + while len(data) > 0: + stream.write(data) + data = wave_file.readframes(4096) + + stream.stop_stream() + stream.close() + ``` + + This code uses a PyAudio stream, the same as capturing audio. The difference here is the stream is set as an output stream, and data is read from the audio data and pushed to the stream. + + Rather than hard coding the stream details such as the sample rate, these are read from the audio data. + +1. Replace the contents of the `say` function with the following: + + ```python + speech = get_speech(text) + play_speech(speech) + ``` + + This code converts the text to speech as binary audio data, and plays the audio. + +1. Run the app, and ensure the function app is also running. Set some timers, and you will hear a spoken response saying that your timer has been set, then another spoken response when the timer is complete. + + If you get `Invalid sample rate` errors, change the `playback_format` as described above. + +> 💁 You can find this code in the [code-spoken-response/pi](code-spoken-response/pi) folder. + +😀 Your timer program was a success!
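If you are not sure which `playback_format` value will work with your speaker, it can help to inspect the audio the service actually returned. The following is an optional debugging sketch (not part of the lesson code) that assumes the `get_speech` function defined above; it prints the properties of the returned WAV data so you can compare them with the format you requested.

```python
import wave

def describe_speech(speech):
    # Inspect the WAV data returned by get_speech
    with wave.open(speech, 'rb') as wave_file:
        print('Channels:    ', wave_file.getnchannels())
        print('Sample rate: ', wave_file.getframerate())
        print('Sample width:', wave_file.getsampwidth(), 'bytes')
    speech.seek(0)  # rewind so the data can still be played afterwards

describe_speech(get_speech('Testing the audio format'))
```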
diff --git a/6-consumer/lessons/3-spoken-feedback/single-board-computer-set-timer.md b/6-consumer/lessons/3-spoken-feedback/single-board-computer-set-timer.md new file mode 100644 index 0000000..efa3b9e --- /dev/null +++ b/6-consumer/lessons/3-spoken-feedback/single-board-computer-set-timer.md @@ -0,0 +1,97 @@ +# Set a timer - Virtual IoT Hardware and Raspberry Pi + +In this part of the lesson, you will set a timer on your virtual IoT device or Raspberry Pi based on a command from the IoT Hub. + +## Set a timer + +The command sent from the serverless function contains the time for the timer in seconds as the payload. This time can be used to set a timer. + +Timers can be set using the Python `threading.Timer` class. This class takes a delay time and a function, and after the delay time, the function is executed. + +### Task - set a timer + +1. Open the `smart-timer` project in VS Code, and ensure the virtual environment is loaded in the terminal if you are using a virtual IoT device. + +1. Add the following import statement at the top of the file to import the threading Python library: + + ```python + import threading + ``` + +1. Above the `handle_method_request` function that handles the method request, add a function to speak a response. For now this will just write to the console, but later in this lesson this will speak the text. + + ```python + def say(text): + print(text) + ``` + +1. Below this, add a function that will be called by a timer to announce that the timer is complete: + + ```python + def announce_timer(minutes, seconds): + announcement = 'Times up on your ' + if minutes > 0: + announcement += f'{minutes} minute' + if seconds > 0: + announcement += f'{seconds} second' + announcement += ' timer.' + say(announcement) + ``` + + This function takes the number of minutes and seconds for the timer, and builds a sentence to say that the timer is complete. It will check the number of minutes and seconds, and only include each time unit if it has a number. For example, if the number of minutes is 0 then only seconds are included in the message. This sentence is then sent to the `say` function. + +1. Below this, add the following `create_timer` function to create a timer: + + ```python + def create_timer(total_seconds): + minutes, seconds = divmod(total_seconds, 60) + threading.Timer(total_seconds, announce_timer, args=[minutes, seconds]).start() + ``` + + This function takes the total number of seconds for the timer that will be sent in the command, and converts this to minutes and seconds. It then creates and starts a timer object using the total number of seconds, passing in the `announce_timer` function and a list containing the minutes and seconds. When the timer elapses, it will call the `announce_timer` function, and pass the contents of this list as the parameters - so the first item in the list gets passed as the `minutes` parameter, and the second item as the `seconds` parameter. + +1. To the end of the `create_timer` function, add some code to build a message to be spoken to the user to announce that the timer is starting: + + ```python + announcement = '' + if minutes > 0: + announcement += f'{minutes} minute' + if seconds > 0: + announcement += f'{seconds} second' + announcement += ' timer started.' + say(announcement) + ``` + + Again, this only includes the time unit that has a value. This sentence is then sent to the `say` function. + +1.
At the start of the `handle_method_request` function, add the following code to check that the `set-timer` direct method was requested: + + ```python + if request.name == 'set-timer': + ``` + +1. Inside this `if` statement, extract the timer time in seconds from the payload and use this to create a timer: + + ```python + payload = json.loads(request.payload) + seconds = payload['seconds'] + if seconds > 0: + create_timer(payload['seconds']) + ``` + + The timer is only created if the number of seconds is greater than 0 + +1. Run the app, and ensure the function app is also running. Set some timers, and the output will show the timer being set, and then will show when it elapses: + + ```output + pi@raspberrypi:~/smart-timer $ python3 app.py + Connecting + Connected + Set a one minute 4 second timer. + 1 minute, 4 second timer started + Times up on your 1 minute, 4 second timer + ``` + +> 💁 You can find this code in the [code-timer/pi](code-timer/pi) or [code-timer/virtual-iot-device](code-timer/virtual-iot-device) folder. + +😀 Your timer program was a success! diff --git a/6-consumer/lessons/3-spoken-feedback/virtual-device-text-to-speech.md b/6-consumer/lessons/3-spoken-feedback/virtual-device-text-to-speech.md new file mode 100644 index 0000000..df71c4a --- /dev/null +++ b/6-consumer/lessons/3-spoken-feedback/virtual-device-text-to-speech.md @@ -0,0 +1,72 @@ +# Text to speech - Virtual IoT device + +In this part of the lesson, you will write code to convert text to speech using the speech service. + +## Convert text to speech + +The speech services SDK that you used in the last lesson to convert speech to text can be used to convert text back to speech. When requesting speech, you need to provide the voice to use as speech can be generated using a variety of different voices. + +Each language supports a range of different voices, and you can get the list of supported voices for each language from the speech services SDK. + +### Task - convert text to speech + +1. Import the `SpeechSynthesizer` from the `azure.cognitiveservices.speech` package by adding it to the existing imports: + + ```python + from azure.cognitiveservices.speech import SpeechConfig, SpeechRecognizer, SpeechSynthesizer + ``` + +1. Above the `say` function, create a speech configuration to use with the speech synthesizer: + + ```python + speech_config = SpeechConfig(subscription=api_key, + region=location) + speech_config.speech_synthesis_language = language + speech_synthesizer = SpeechSynthesizer(speech_config=speech_config) + ``` + + This uses the same API key, location and language that was used by the recognizer. + +1. Below this, add the following code to get a voice and set it on the speech config: + + ```python + voices = speech_synthesizer.get_voices_async().get().voices + first_voice = next(x for x in voices if x.locale.lower() == language.lower()) + speech_config.speech_synthesis_voice_name = first_voice.short_name + ``` + + This retrieves a list of all the available voices, then finds the first voice that matches the language that is being used. + + > 💁 You can get the full list of supported voices from the [Language and voice support documentation on Microsoft Docs](https://docs.microsoft.com/azure/cognitive-services/speech-service/language-support?WT.mc_id=academic-17441-jabenn#text-to-speech). If you want to use a specific voice, then you can remove this function and hard code the voice to the voice name from this documentation. 
For example: + > + > ```python + > speech_config.speech_synthesis_voice_name = 'hi-IN-SwaraNeural' + > ``` + +1. Update the contents of the `say` function to generate SSML for the response: + + ```python + ssml = f'' + ssml += f'' + ssml += text + ssml += '' + ssml += '' + ``` + +1. Below this, stop the speech recognition, speak the SSML, then start the recognition again: + + ```python + recognizer.stop_continuous_recognition() + speech_synthesizer.speak_ssml(ssml) + recognizer.start_continuous_recognition() + ``` + + The recognition is stopped whilst the text is spoken to avoid the announcement of the timer starting being detected, sent to LUIS and possibly interpreted as a request to set a new timer. + + > 💁 You can test this out by commenting out the lines to stop and restart the recognition. Set one timer, and you may find the announcement sets a new timer, which causes a new announcement, leading to a new timer, and so on for ever! + +1. Run the app, and ensure the function app is also running. Set some timers, and you will hear a spoken response saying that your timer has been set, then another spoken response when the timer is complete. + +> 💁 You can find this code in the [code-spoken-response/virtual-iot-device](code-spoken-response/virtual-iot-device) folder. + +😀 Your timer program was a success! diff --git a/6-consumer/lessons/3-spoken-feedback/wio-terminal-set-timer.md b/6-consumer/lessons/3-spoken-feedback/wio-terminal-set-timer.md new file mode 100644 index 0000000..2e8910e --- /dev/null +++ b/6-consumer/lessons/3-spoken-feedback/wio-terminal-set-timer.md @@ -0,0 +1,3 @@ +# Set a timer - Wio Terminal + +Coming soon diff --git a/6-consumer/lessons/3-spoken-feedback/wio-terminal-text-to-speech.md b/6-consumer/lessons/3-spoken-feedback/wio-terminal-text-to-speech.md new file mode 100644 index 0000000..e27369e --- /dev/null +++ b/6-consumer/lessons/3-spoken-feedback/wio-terminal-text-to-speech.md @@ -0,0 +1,3 @@ +# Text to speech - Wio Terminal + +Coming soon diff --git a/images/Diagrams.sketch b/images/Diagrams.sketch index c298bcf..468d852 100644 Binary files a/images/Diagrams.sketch and b/images/Diagrams.sketch differ diff --git a/images/tts-overview.png b/images/tts-overview.png new file mode 100644 index 0000000..d418127 Binary files /dev/null and b/images/tts-overview.png differ