From 103bf0061b6bdc34e8d3ea805bda75475d5b1398 Mon Sep 17 00:00:00 2001 From: Ian Lavery Date: Thu, 30 Nov 2023 15:57:32 -0800 Subject: [PATCH] readme updates (#338) --- .github/workflows/c-demos.yml | 4 +- .github/workflows/react-native-tests.yml | 6 +- README.md | 121 +++++++++++++---------- binding/android/README.md | 31 ++++-- binding/dotnet/README.md | 19 +++- binding/flutter/README.md | 57 ++++++----- binding/go/README.md | 20 +++- binding/ios/README.md | 13 ++- binding/java/README.md | 22 ++++- binding/nodejs/README.md | 27 +++-- binding/python/README.md | 19 +++- binding/react-native/README.md | 33 ++++--- binding/react/README.md | 17 ++-- binding/rust/README.md | 20 +++- binding/web/README.md | 65 ++++++------ demo/c/README.md | 20 ++-- demo/dotnet/README.md | 10 +- demo/go-grpc/README.md | 4 +- demo/go/README.md | 8 +- demo/java/README.md | 2 +- demo/nodejs/README.md | 4 +- demo/python-subtitle/README.md | 2 +- demo/python/README.md | 8 +- demo/rust/README.md | 4 +- 24 files changed, 335 insertions(+), 201 deletions(-) diff --git a/.github/workflows/c-demos.yml b/.github/workflows/c-demos.yml index e57239c1..c5d78ae2 100644 --- a/.github/workflows/c-demos.yml +++ b/.github/workflows/c-demos.yml @@ -5,9 +5,9 @@ on: push: branches: [ master ] paths: - - '!demo/c/README.md' - '.github/workflows/c-demos.yml' - 'demo/c/**' + - '!demo/c/README.md' - 'include/**' - 'lib/common/**' - 'lib/jetson/**' @@ -20,9 +20,9 @@ on: pull_request: branches: [ master, 'v[0-9]+.[0-9]+' ] paths: - - '!demo/c/README.md' - '.github/workflows/c-demos.yml' - 'demo/c/**' + - '!demo/c/README.md' - 'include/**' - 'lib/common/**' - 'lib/jetson/**' diff --git a/.github/workflows/react-native-tests.yml b/.github/workflows/react-native-tests.yml index 47fc1440..ee997680 100644 --- a/.github/workflows/react-native-tests.yml +++ b/.github/workflows/react-native-tests.yml @@ -4,17 +4,19 @@ on: push: branches: [ master ] paths: + - '.github/workflows/react-native-tests.yml' - 'binding/react-native/**' + - '!binding/react-native/README.md' - 'lib/common/**' - - '.github/workflows/react-native-tests.yml' - 'resources/audio_samples/**' - 'resources/.test/**' pull_request: branches: [ master, 'v[0-9]+.[0-9]+' ] paths: + - '.github/workflows/react-native-tests.yml' - 'binding/react-native/**' + - '!binding/react-native/README.md' - 'lib/common/**' - - '.github/workflows/react-native-tests.yml' - 'resources/audio_samples/**' - 'resources/.test/**' diff --git a/README.md b/README.md index 021984d6..499d0616 100644 --- a/README.md +++ b/README.md @@ -79,10 +79,10 @@ pip3 install pvleoparddemo Run the following in the terminal: ```bash -leopard_demo_file --access_key ${ACCESS_KEY} --audio_paths ${AUDIO_PATH} +leopard_demo_file --access_key ${ACCESS_KEY} --audio_paths ${AUDIO_FILE_PATH} ``` -Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_PATH}` with a path to an audio file you +Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_FILE_PATH}` with a path to an audio file you wish to transcribe. 
### C Demo

@@ -96,12 +96,12 @@ cmake -S demo/c/ -B demo/c/build && cmake --build demo/c/build
Run the demo:

```console
-./demo/c/build/leopard_demo -a ${ACCESS_KEY} -l ${LIBRARY_PATH} -m ${MODEL_PATH} ${AUDIO_PATH}
+./demo/c/build/leopard_demo -a ${ACCESS_KEY} -l ${LIBRARY_PATH} -m ${MODEL_FILE_PATH} ${AUDIO_FILE_PATH}
```

Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console, `${LIBRARY_PATH}` with the path to appropriate
-library under [lib](/lib), `${MODEL_PATH}` to path to [default model file](./lib/common/leopard_params.pv)
-(or your custom one), and `${AUDIO_PATH}` with a path to an audio file you wish to transcribe.
+library under [lib](/lib), `${MODEL_FILE_PATH}` with the path to the [default model file](./lib/common/leopard_params.pv)
+(or your custom one), and `${AUDIO_FILE_PATH}` with a path to an audio file you wish to transcribe.

### iOS Demo

@@ -132,10 +132,10 @@ yarn global add @picovoice/leopard-node-demo
Run the following in the terminal:

```console
-leopard-file-demo --access_key ${ACCESS_KEY} --input_audio_file_path ${AUDIO_PATH}
+leopard-file-demo --access_key ${ACCESS_KEY} --input_audio_file_path ${AUDIO_FILE_PATH}
```

-Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_PATH}` with a path to an audio file you
+Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_FILE_PATH}` with a path to an audio file you
wish to transcribe.

For more information about Node.js demos go to [demo/nodejs](./demo/nodejs).

@@ -165,10 +165,10 @@ The demo requires `cgo`, which on Windows may mean that you need to install a gc
From [demo/go](./demo/go) run the following command from the terminal to build and run the file demo:

```console
-go run filedemo/leopard_file_demo.go -access_key "${ACCESS_KEY}" -input_audio_path "${AUDIO_PATH}"
+go run filedemo/leopard_file_demo.go -access_key "${ACCESS_KEY}" -input_audio_path "${AUDIO_FILE_PATH}"
```

-Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_PATH}` with a path to an audio file you wish to transcribe.
+Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_FILE_PATH}` with a path to an audio file you wish to transcribe.

For more information about Go demos go to [demo/go](./demo/go).

@@ -202,10 +202,10 @@ From [demo/java](./demo/java) run the following commands from the terminal to bu
cd demo/java
./gradlew build
cd build/libs
-java -jar leopard-file-demo.jar -a ${ACCESS_KEY} -i ${AUDIO_PATH}
+java -jar leopard-file-demo.jar -a ${ACCESS_KEY} -i ${AUDIO_FILE_PATH}
```

-Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_PATH}` with a path to an audio file you wish to transcribe.
+Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_FILE_PATH}` with a path to an audio file you wish to transcribe.

For more information about Java demos go to [demo/java](./demo/java).

@@ -217,10 +217,10 @@ file or on real-time microphone input.
From [demo/dotnet/LeopardDemo](./demo/dotnet/LeopardDemo) run the following in the terminal:

```console
-dotnet run -c FileDemo.Release -- --access_key ${ACCESS_KEY} --input_audio_path ${AUDIO_PATH}
+dotnet run -c FileDemo.Release -- --access_key ${ACCESS_KEY} --input_audio_path ${AUDIO_FILE_PATH}
```

-Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_PATH}` with a path to an audio file you
+Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_FILE_PATH}` with a path to an audio file you
wish to transcribe.

For more information about .NET demos, go to [demo/dotnet](./demo/dotnet).

@@ -233,10 +233,10 @@ file or on real-time microphone input.
From [demo/rust/filedemo](./demo/rust/filedemo) run the following in the terminal:

```console
-cargo run --release -- --access_key ${ACCESS_KEY} --input_audio_path ${AUDIO_PATH}
+cargo run --release -- --access_key ${ACCESS_KEY} --input_audio_path ${AUDIO_FILE_PATH}
```

-Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_PATH}` with a path to an audio file you
+Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_FILE_PATH}` with a path to an audio file you
wish to transcribe.

For more information about Rust demos, go to [demo/rust](./demo/rust).

@@ -294,14 +294,14 @@ Create an instance of the engine and transcribe an audio file:

```python
import pvleopard

-handle = pvleopard.create(access_key='${ACCESS_KEY}')
+leopard = pvleopard.create(access_key='${ACCESS_KEY}')

-print(handle.process_file('${AUDIO_PATH}'))
+print(leopard.process_file('${AUDIO_FILE_PATH}'))
```

Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/) and
-`${AUDIO_PATH}` to path an audio file. Finally, when done be sure to explicitly release the resources using
-`handle.delete()`.
+`${AUDIO_FILE_PATH}` with the path to an audio file. Finally, when done be sure to explicitly release the resources using
+`leopard.delete()`.
### C

@@ -314,9 +314,16 @@ Create an instance of the engine and transcribe an audio file:

#include "pv_leopard.h"

-pv_leopard_t *handle = NULL;
-bool automatic_punctuation = false;
-pv_status_t status = pv_leopard_init("${ACCESS_KEY}", "${MODEL_PATH}", automatic_punctuation, &handle);
+pv_leopard_t *leopard = NULL;
+bool enable_automatic_punctuation = false;
+bool enable_speaker_diarization = false;
+
+pv_status_t status = pv_leopard_init(
+    "${ACCESS_KEY}",
+    "${MODEL_FILE_PATH}",
+    enable_automatic_punctuation,
+    enable_speaker_diarization,
+    &leopard);
if (status != PV_STATUS_SUCCESS) {
    // error handling logic
}
@@ -324,7 +331,12 @@ if (status != PV_STATUS_SUCCESS) {
char *transcript = NULL;
int32_t num_words = 0;
pv_word_t *words = NULL;
-status = pv_leopard_process_file(handle, "${AUDIO_PATH}", &transcript, &num_words, &words);
+status = pv_leopard_process_file(
+    leopard,
+    "${AUDIO_FILE_PATH}",
+    &transcript,
+    &num_words,
+    &words);
if (status != PV_STATUS_SUCCESS) {
    // error handling logic
}
@@ -333,20 +345,21 @@ fprintf(stdout, "%s\n", transcript);
for (int32_t i = 0; i < num_words; i++) {
    fprintf(
        stdout,
-        "[%s]\t.start_sec = %.1f .end_sec = %.1f .confidence = %.2f\n",
+        "[%s]\t.start_sec = %.1f .end_sec = %.1f .confidence = %.2f .speaker_tag = %d\n",
        words[i].word,
        words[i].start_sec,
        words[i].end_sec,
-        words[i].confidence);
+        words[i].confidence,
+        words[i].speaker_tag);
}

pv_leopard_transcript_delete(transcript);
pv_leopard_words_delete(words);
```

-Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console, `${MODEL_PATH}` to path to
-[default model file](./lib/common/leopard_params.pv) (or your custom one), and `${AUDIO_PATH}` to path an audio file.
-Finally, when done be sure to release resources acquired using `pv_leopard_delete(handle)`.
+Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console, `${MODEL_FILE_PATH}` with the path to the
+[default model file](./lib/common/leopard_params.pv) (or your custom one), and `${AUDIO_FILE_PATH}` with the path to an audio file.
+Finally, when done be sure to release resources acquired using `pv_leopard_delete(leopard)`.

### iOS

@@ -393,20 +406,20 @@ Create an instance of the engine and transcribe an audio file:
import ai.picovoice.leopard.*;

final String accessKey = "${ACCESS_KEY}"; // AccessKey obtained from Picovoice Console (https://console.picovoice.ai/)
-final String modelPath = "${MODEL_FILE}";
+final String modelPath = "${MODEL_FILE_PATH}";

try {
-    Leopard handle = new Leopard.Builder()
+    Leopard leopard = new Leopard.Builder()
        .setAccessKey(accessKey)
        .setModelPath(modelPath)
        .build(appContext);

    File audioFile = new File("${AUDIO_FILE_PATH}");
-    LeopardTranscript transcript = handle.processFile(audioFile.getAbsolutePath());
+    LeopardTranscript transcript = leopard.processFile(audioFile.getAbsolutePath());
} catch (LeopardException ex) { }
```

-Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console, `${MODEL_FILE}` with a custom trained model from [console](https://console.picovoice.ai/) or the [default model](./lib/common/leopard_params.pv), and `${AUDIO_FILE_PATH}` with the path to the audio file.
+Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console, `${MODEL_FILE_PATH}` with a custom trained model from [console](https://console.picovoice.ai/) or the [default model](./lib/common/leopard_params.pv), and `${AUDIO_FILE_PATH}` with the path to the audio file.
### Node.js

@@ -421,19 +434,19 @@ Create instances of the Leopard class:

```javascript
const Leopard = require("@picovoice/leopard-node");
const accessKey = "${ACCESS_KEY}" // Obtained from the Picovoice Console (https://console.picovoice.ai/)
-let handle = new Leopard(accessKey);
+let leopard = new Leopard(accessKey);

-const result = engineInstance.processFile('${AUDIO_PATH}');
+const result = leopard.processFile('${AUDIO_FILE_PATH}');
console.log(result.transcript);
```

Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/) and
-`${AUDIO_PATH}` to path an audio file.
+`${AUDIO_FILE_PATH}` with the path to an audio file.

When done, be sure to release resources using `release()`:

```javascript
-handle.release();
+leopard.release();
```

### Flutter

@@ -450,29 +463,29 @@ Create an instance of the engine and transcribe an audio file:

```dart
import 'package:leopard/leopard.dart';

-const accessKey = "{ACCESS_KEY}" // AccessKey obtained from Picovoice Console (https://console.picovoice.ai/)
+final String accessKey = '{ACCESS_KEY}'; // AccessKey obtained from Picovoice Console (https://console.picovoice.ai/)

try {
-    Leopard _leopard = await Leopard.create(accessKey, '{LEOPARD_MODEL_PATH}');
+    Leopard _leopard = await Leopard.create(accessKey, '{MODEL_FILE_PATH}');
    LeopardTranscript result = await _leopard.processFile("${AUDIO_FILE_PATH}");
    print(result.transcript);
} on LeopardException catch (err) { }
```

-Replace `${ACCESS_KEY}` with your `AccessKey` obtained from [Picovoice Console](https://console.picovoice.ai/), `${MODEL_FILE}` with a custom trained model from [Picovoice Console](https://console.picovoice.ai/) or the [default model](./lib/common/leopard_params.pv), and `${AUDIO_FILE_PATH}` with the path to the audio file.
+Replace `${ACCESS_KEY}` with your `AccessKey` obtained from [Picovoice Console](https://console.picovoice.ai/), `${MODEL_FILE_PATH}` with a custom trained model from [Picovoice Console](https://console.picovoice.ai/) or the [default model](./lib/common/leopard_params.pv), and `${AUDIO_FILE_PATH}` with the path to the audio file.

### Go

Install the Go binding:

```console
-go get github.com/Picovoice/leopard/binding/go
+go get github.com/Picovoice/leopard/binding/go/v2
```

Create an instance of the engine and transcribe an audio file:

```go
-import . "github.com/Picovoice/leopard/binding/go"
+import . "github.com/Picovoice/leopard/binding/go/v2"

leopard = Leopard{AccessKey: "${ACCESS_KEY}"}
err := leopard.Init()
@@ -481,7 +494,7 @@ if err != nil {
    // handle init error
}
defer leopard.Delete()

-transcript, words, err := leopard.ProcessFile("${AUDIO_PATH}")
+transcript, words, err := leopard.ProcessFile("${AUDIO_FILE_PATH}")
if err != nil {
    // handle process error
}
@@ -490,7 +503,7 @@ log.Println(transcript)
```

Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/) and
-`${AUDIO_PATH}` to path an audio file. Finally, when done be sure to explicitly release the resources using
+`${AUDIO_FILE_PATH}` with the path to an audio file. Finally, when done be sure to explicitly release the resources using
`leopard.Delete()`.
### React Native

@@ -511,7 +524,7 @@ const getAudioFrame = () => {
}

try {
-  const leopard = await Leopard.create("${ACCESS_KEY}", "${MODEL_FILE}")
+  const leopard = await Leopard.create("${ACCESS_KEY}", "${MODEL_FILE_PATH}")
  const { transcript, words } = await leopard.processFile("${AUDIO_FILE_PATH}")
  console.log(transcript)
} catch (err: any) {
@@ -521,7 +534,7 @@ try {
}
```

-Replace `${ACCESS_KEY}` with your `AccessKey` obtained from Picovoice Console, `${MODEL_FILE}` with a custom trained model from [Picovoice Console](https://console.picovoice.ai/) or the [default model](./lib/common/leopard_params.pv) and `${AUDIO_FILE_PATH}` with the absolute path of the audio file.
+Replace `${ACCESS_KEY}` with your `AccessKey` obtained from Picovoice Console, `${MODEL_FILE_PATH}` with a custom trained model from [Picovoice Console](https://console.picovoice.ai/) or the [default model](./lib/common/leopard_params.pv) and `${AUDIO_FILE_PATH}` with the absolute path of the audio file.
When done be sure to explicitly release the resources using `leopard.delete()`.

### Java

@@ -541,14 +554,14 @@ final String accessKey = "${ACCESS_KEY}";

try {
    Leopard leopard = new Leopard.Builder().setAccessKey(accessKey).build();
-    LeopardTranscript result = leopard.processFile("${AUDIO_PATH}");
+    LeopardTranscript result = leopard.processFile("${AUDIO_FILE_PATH}");
    leopard.delete();
} catch (LeopardException ex) { }

-System.out.println(transcript);
+System.out.println(result.getTranscriptString());
```

-Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/) and `${AUDIO_PATH}` to the path an audio file. Finally, when done be sure to explicitly release the resources using `leopard.delete()`.
+Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/) and `${AUDIO_FILE_PATH}` with the path to an audio file. Finally, when done be sure to explicitly release the resources using `leopard.delete()`.

### .NET

@@ -564,14 +577,14 @@ Create an instance of the engine and transcribe an audio file:

```csharp
using Pv;

const string accessKey = "${ACCESS_KEY}";
-const string audioPath = "/absolute/path/to/audio_file";
+const string audioPath = "${AUDIO_FILE_PATH}";

-Leopard handle = Leopard.Create(accessKey);
+Leopard leopard = Leopard.Create(accessKey);

-Console.Write(handle.ProcessFile(audioPath));
+Console.Write(leopard.ProcessFile(audioPath));
```

-Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/). Finally, when done release the resources using `handle.Dispose()`.
+Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/). Finally, when done release the resources using `leopard.Dispose()`.

### Rust

@@ -698,8 +711,10 @@ function App(props) {

## Releases

-### v2.0.0 - November 27th, 2023
+### v2.0.0 - November 30th, 2023

+- Added speaker diarization feature
+- Added React SDK
- Improvements to error reporting
- Upgrades to authorization and authentication system
- Improved engine accuracy
diff --git a/binding/android/README.md b/binding/android/README.md
index 5ec0e176..3d97126b 100644
--- a/binding/android/README.md
+++ b/binding/android/README.md
@@ -40,20 +40,15 @@ Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get you

## Usage

-Add the Leopard model file to your Android application by:
-
-1. 
Either create a model in [Picovoice Console](https://console.picovoice.ai/) or use one of the default language models found in [lib/common](../../lib/common). -2. Add the model as a bundled resource by placing it under the assets directory of your Android project (`src/main/assets/`). - Create an instance of the engine with the Leopard Builder class by passing in the `accessKey`, `modelPath` and Android app context: ```java import ai.picovoice.leopard.*; final String accessKey = "${ACCESS_KEY}"; // AccessKey provided by Picovoice Console (https://console.picovoice.ai/) -final String modelPath = "${MODEL_PATH}"; // path relative to the assets folder or absolute path to file on device +final String modelPath = "${MODEL_FILE_PATH}"; // path relative to the assets folder or absolute path to file on device try { - Leopard handle = new Leopard.Builder() + Leopard leopard = new Leopard.Builder() .setAccessKey(accessKey) .setModelPath(modelPath) .build(appContext); @@ -64,7 +59,7 @@ Transcribe an audio file by providing the absolute path to the file: ```java File audioFile = new File("${AUDIO_FILE_PATH}"); -LeopardTranscript transcript = handle.processFile(audioFile.getAbsolutePath()); +LeopardTranscript transcript = leopard.processFile(audioFile.getAbsolutePath()); ``` Supported audio file formats are `3gp (AMR)`, `FLAC`, `MP3`, `MP4/m4a (AAC)`, `Ogg`, `WAV` and `WebM`. @@ -74,15 +69,31 @@ Transcribe raw audio data (sample rate of 16 kHz, 16-bit linearly encoded and 1 short[] getAudioData() { // ... } -LeopardTranscript transcript = handle.process(getAudioData()); +LeopardTranscript transcript = leopard.process(getAudioData()); ``` When done, release resources explicitly: ```java -handle.delete(); +leopard.delete(); ``` +### Language Model + +Add the Leopard model file to your Android application by: + +1. Either create a model in [Picovoice Console](https://console.picovoice.ai/) or use one of the default language models found in [lib/common](../../lib/common). +2. Add the model as a bundled resource by placing it under the assets directory of your Android project (`src/main/assets/`). + +### Word Metadata + +Along with the transcript, Leopard returns metadata for each transcribed word. Available metadata items are: + +- **Start Time:** Indicates when the word started in the transcribed audio. Value is in seconds. +- **End Time:** Indicates when the word ended in the transcribed audio. Value is in seconds. +- **Confidence:** Leopard's confidence that the transcribed word is accurate. It is a number within `[0, 1]`. +- **Speaker Tag:** If speaker diarization is enabled on initialization, the speaker tag is a non-negative integer identifying unique speakers, with `0` reserved for unknown speakers. If speaker diarization is not enabled, the value will always be `-1`. + ## Demo App For example usage, refer to our [Android demo application](../../demo/android). diff --git a/binding/dotnet/README.md b/binding/dotnet/README.md index 94656407..ff049b2f 100644 --- a/binding/dotnet/README.md +++ b/binding/dotnet/README.md @@ -64,16 +64,16 @@ Create an instance of the engine and transcribe an audio file: using Pv; const string accessKey = "${ACCESS_KEY}"; -const string audioPath = "/absolute/path/to/audio_file"; +const string audioPath = "${AUDIO_FILE_PATH}"; Leopard leopard = Leopard.Create(accessKey); LeopardTranscript result = leopard.ProcessFile(audioPath); ``` -Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/). 
Finally, when done release the resources using `handle.Dispose()`. +Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/). Finally, when done release the resources using `leopard.Dispose()`. -## Language Model +### Language Model The Leopard .NET SDK comes preloaded with a default English language model (`.pv` file). Default models for other supported languages can be found in [lib/common](../../lib/common). @@ -83,9 +83,20 @@ language models with custom vocabulary and boost words in the existing vocabular Pass in the `.pv` file via the `modelPath` argument in the `Create()` constructor: ```csharp -Leopard handle = Leopard.Create(accessKey, modelPath); +const string modelPath = "${MODEL_FILE_PATH}"; + +Leopard leopard = Leopard.Create(accessKey, modelPath); ``` +### Word Metadata + +Along with the transcript, Leopard returns metadata for each transcribed word. Available metadata items are: + +- **Start Time:** Indicates when the word started in the transcribed audio. Value is in seconds. +- **End Time:** Indicates when the word ended in the transcribed audio. Value is in seconds. +- **Confidence:** Leopard's confidence that the transcribed word is accurate. It is a number within `[0, 1]`. +- **Speaker Tag:** If speaker diarization is enabled on initialization, the speaker tag is a non-negative integer identifying unique speakers, with `0` reserved for unknown speakers. If speaker diarization is not enabled, the value will always be `-1`. + ## Demos The [Leopard dotnet demo project](https://github.com/Picovoice/leopard/tree/master/demo/dotnet) is a .NET Core console app that allows for processing real-time audio (i.e. microphone) and files using Leopard. diff --git a/binding/flutter/README.md b/binding/flutter/README.md index 2924e4d0..48aa4ad4 100644 --- a/binding/flutter/README.md +++ b/binding/flutter/README.md @@ -54,25 +54,6 @@ On Android, open your AndroidManifest.xml and add the following line: ``` -## Leopard Model File Integration - -Add the Leopard model file to your Flutter application: - -1. Create a model in [Picovoice Console](https://console.picovoice.ai/) or use one of the [default language models](https://github.com/Picovoice/leopard/tree/master/lib/common). -2. Add the model file to an `assets` folder in your project directory. -3. Add the asset to your `pubspec.yaml`: -```yaml -flutter: - assets: - - assets/leopard_model.pv -``` -4. In this example, the path to the model file in code would then be as follows: -```dart -String modelPath = "assets/leopard_model.pv"; -``` - -Alternatively, if the model file is deployed to the device with a different method, the absolute path to the file on device can be used. 
- ## Usage An instance of [`Leopard`](https://picovoice.ai/docs/api/leopard-flutter/#leopard) is created by passing a model file path into its static constructor `create`: @@ -81,7 +62,7 @@ An instance of [`Leopard`](https://picovoice.ai/docs/api/leopard-flutter/#leopar import 'package:leopard_flutter/leopard.dart'; String accessKey = '{ACCESS_KEY}' // AccessKey obtained from Picovoice Console (https://console.picovoice.ai/) -String modelPath = '{LEOPARD_MODEL_PATH}' // path relative to the assets folder or absolute path to file on device +String modelPath = '{MODEL_FILE_PATH}' // path relative to the assets folder or absolute path to file on device void createLeopard() async { try { @@ -95,18 +76,48 @@ void createLeopard() async { Transcribe an audio file by passing in the path: ```dart +String audioPath = '{AUDIO_FILE_PATH}' + try { - LeopardTranscript result = await _leopard.processFile("${AUDIO_FILE_PATH}"); + LeopardTranscript result = await _leopard.processFile(audioPath); print(result.transcript); } on LeopardException catch (err) { } ``` -When done, resources have to be released explicitly: +When done, resources must be released explicitly: + +```dart +await _leopard.delete(); +``` + +### Language Model + +Add the Leopard model file to your Flutter application: +1. Create a model in [Picovoice Console](https://console.picovoice.ai/) or use one of the [default language models](https://github.com/Picovoice/leopard/tree/master/lib/common). +2. Add the model file to an `assets` folder in your project directory. +3. Add the asset to your `pubspec.yaml`: +```yaml +flutter: + assets: + - assets/leopard_model.pv +``` +4. In this example, the path to the model file in code would then be as follows: ```dart -leopard.delete(); +String modelPath = "assets/leopard_model.pv"; ``` +Alternatively, if the model file is deployed to the device with a different method, the absolute path to the file on device can be used. + +### Word Metadata + +Along with the transcript, Leopard returns metadata for each transcribed word. Available metadata items are: + +- **Start Time:** Indicates when the word started in the transcribed audio. Value is in seconds. +- **End Time:** Indicates when the word ended in the transcribed audio. Value is in seconds. +- **Confidence:** Leopard's confidence that the transcribed word is accurate. It is a number within `[0, 1]`. +- **Speaker Tag:** If speaker diarization is enabled on initialization, the speaker tag is a non-negative integer identifying unique speakers, with `0` reserved for unknown speakers. If speaker diarization is not enabled, the value will always be `-1`. + ## Demo App For example usage, refer to our [Flutter demo application](https://github.com/Picovoice/leopard/tree/master/demo/flutter). diff --git a/binding/go/README.md b/binding/go/README.md index 673c9c40..9f5d9ebb 100644 --- a/binding/go/README.md +++ b/binding/go/README.md @@ -25,7 +25,7 @@ Leopard is an on-device speech-to-text engine. Leopard is: ## Installation ```console -go get github.com/Picovoice/leopard/binding/go +go get github.com/Picovoice/leopard/binding/go/v2 ``` ## AccessKey @@ -39,7 +39,7 @@ Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get you Create an instance of the engine and transcribe an audio file: ```go -import . "github.com/Picovoice/leopard/binding/go" +import . 
"github.com/Picovoice/leopard/binding/go/v2" leopard := NewLeopard("${ACCESS_KEY}") err := leopard.Init() @@ -48,7 +48,7 @@ if err != nil { } defer leopard.Delete() -transcript, words, err := leopard.ProcessFile("${AUDIO_PATH}") +transcript, words, err := leopard.ProcessFile("${AUDIO_FILE_PATH}") if err != nil { // handle process error } @@ -57,7 +57,7 @@ print(transcript) ``` Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/) and -`${AUDIO_PATH}` to the path an audio file. Finally, when done be sure to explicitly release the resources using +`${AUDIO_FILE_PATH}` to the path an audio file. Finally, when done be sure to explicitly release the resources using `leopard.Delete()`. ## Language Model @@ -71,10 +71,20 @@ language models with custom vocabulary and boost words in the existing vocabular Pass in the `.pv` file by setting `.ModelPath` on an instance of Leopard before initializing: ```go leopard := NewLeopard("${ACCESS_KEY}") -leopard.ModelPath = "${MODEL_PATH}" +leopard.ModelPath = "${MODEL_FILE_PATH}" err := leopard.Init() + ``` +### Word Metadata + +Along with the transcript, Leopard returns metadata for each transcribed word. Available metadata items are: + +- **Start Time:** Indicates when the word started in the transcribed audio. Value is in seconds. +- **End Time:** Indicates when the word ended in the transcribed audio. Value is in seconds. +- **Confidence:** Leopard's confidence that the transcribed word is accurate. It is a number within `[0, 1]`. +- **Speaker Tag:** If speaker diarization is enabled on initialization, the speaker tag is a non-negative integer identifying unique speakers, with `0` reserved for unknown speakers. If speaker diarization is not enabled, the value will always be `-1`. + ## Demos Check out the Leopard Go demos [here](https://github.com/Picovoice/leopard/tree/master/demo/go). diff --git a/binding/ios/README.md b/binding/ios/README.md index 9541d074..b4909cce 100644 --- a/binding/ios/README.md +++ b/binding/ios/README.md @@ -83,11 +83,20 @@ language models with custom vocabulary and boost words in the existing vocabular Pass in the `.pv` file via the `modelURL` or `modelPath` constructor argument: ```swift -let leopard = Leopard(accessKey: accessKey, modelPath: modelPath) +let leopard = Leopard(accessKey: accessKey, modelPath: "${MODEL_FILE_PATH") // or -let leopard = Leopard(accessKey: accessKey, modelURL: modelURL) +let leopard = Leopard(accessKey: accessKey, modelURL: "${MODEL_FILE_URL}") ``` +### Word Metadata + +Along with the transcript, Leopard returns metadata for each transcribed word. Available metadata items are: + +- **Start Time:** Indicates when the word started in the transcribed audio. Value is in seconds. +- **End Time:** Indicates when the word ended in the transcribed audio. Value is in seconds. +- **Confidence:** Leopard's confidence that the transcribed word is accurate. It is a number within `[0, 1]`. +- **Speaker Tag:** If speaker diarization is enabled on initialization, the speaker tag is a non-negative integer identifying unique speakers, with `0` reserved for unknown speakers. If speaker diarization is not enabled, the value will always be `-1`. + ## Running Unit Tests Copy your `AccessKey` into the `accessKey` variable in [`LeopardAppTestUITests.swift`](LeopardAppTest/LeopardAppTestUITests/LeopardAppTestUITests.swift). Open `LeopardAppTest.xcworkspace` with XCode and run the tests with `Product > Test`. 
diff --git a/binding/java/README.md b/binding/java/README.md
index bd3720ef..2a441cb5 100644
--- a/binding/java/README.md
+++ b/binding/java/README.md
@@ -55,20 +55,21 @@ Create an instance of the engine with the Leopard Builder class and transcribe a
import ai.picovoice.leopard.*;

final String accessKey = "${ACCESS_KEY}";
+final String audioPath = "${AUDIO_FILE_PATH}";

try {
    Leopard leopard = new Leopard.Builder()
        .setAccessKey(accessKey)
        .build();

-    LeopardTranscript result = leopard.processFile("${AUDIO_PATH}");
+    LeopardTranscript result = leopard.processFile(audioPath);
    leopard.delete();
} catch (LeopardException ex) { }

-System.out.println(transcript);
+System.out.println(result.getTranscriptString());
```

-Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/) and `${AUDIO_PATH}`
+Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/) and `${AUDIO_FILE_PATH}`
to the path an audio file.

Finally, when done be sure to explicitly release the resources using
`leopard.delete()`.

@@ -82,12 +83,23 @@ language models with custom vocabulary and boost words in the existing vocabular
Pass in the `.pv` file via the `.setModelPath()` Builder argument:

```java
+final String modelPath = "${MODEL_FILE_PATH}";
+
Leopard leopard = new Leopard.Builder()
-    .setAccessKey("${ACCESS_KEY}")
-    .setModelPath("${MODEL_PATH")
+    .setAccessKey(accessKey)
+    .setModelPath(modelPath)
    .build();
```

+### Word Metadata
+
+Along with the transcript, Leopard returns metadata for each transcribed word. Available metadata items are:
+
+- **Start Time:** Indicates when the word started in the transcribed audio. Value is in seconds.
+- **End Time:** Indicates when the word ended in the transcribed audio. Value is in seconds.
+- **Confidence:** Leopard's confidence that the transcribed word is accurate. It is a number within `[0, 1]`.
+- **Speaker Tag:** If speaker diarization is enabled on initialization, the speaker tag is a non-negative integer identifying unique speakers, with `0` reserved for unknown speakers. If speaker diarization is not enabled, the value will always be `-1`.
+
## Demo App

For example usage, refer to our [Java demos](../../demo/java).
diff --git a/binding/nodejs/README.md b/binding/nodejs/README.md
index 45183d9c..81aabdd5 100644
--- a/binding/nodejs/README.md
+++ b/binding/nodejs/README.md
@@ -36,19 +36,21 @@ Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get you

Create an instance of the engine and transcribe an audio file:

-```javascript
-const {Leopard} = require("@picovoice/leopard-node");
+```typescript
+const { Leopard } = require("@picovoice/leopard-node");
const accessKey = "${ACCESS_KEY}"; // Obtained from the Picovoice Console (https://console.picovoice.ai/)
-const handle = new Leopard(accessKey);
+const audioPath = "${AUDIO_FILE_PATH}";
+
+const leopard = new Leopard(accessKey);

-const result = handle.processFile('${AUDIO_PATH}');
+const result = leopard.processFile(audioPath);
console.log(result.transcript);
```

Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/) and
-`${AUDIO_PATH}` to the path an audio file. Finally, when done be sure to explicitly release the resources using
-`handle.release()`.
+`${AUDIO_FILE_PATH}` with the path to an audio file. Finally, when done be sure to explicitly release the resources using
+`leopard.release()`. 
## Language Model

@@ -60,9 +62,20 @@ language models with custom vocabulary and boost words in the existing vocabular
Pass in the `.pv` file via the `modelPath` parameter in the `options` argument of the Leopard constructor:

```javascript
-const handle = new Leopard(accessKey, { modelPath: "${MODEL_PATH}"});
+const leopard = new Leopard(
+    accessKey,
+    { modelPath: "${MODEL_FILE_PATH}"});
```

+### Word Metadata
+
+Along with the transcript, Leopard returns metadata for each transcribed word. Available metadata items are:
+
+- **Start Time:** Indicates when the word started in the transcribed audio. Value is in seconds.
+- **End Time:** Indicates when the word ended in the transcribed audio. Value is in seconds.
+- **Confidence:** Leopard's confidence that the transcribed word is accurate. It is a number within `[0, 1]`.
+- **Speaker Tag:** If speaker diarization is enabled on initialization, the speaker tag is a non-negative integer identifying unique speakers, with `0` reserved for unknown speakers. If speaker diarization is not enabled, the value will always be `-1`.
+
## Demos

[Leopard Node.js demo package](https://www.npmjs.com/package/@picovoice/leopard-node-demo) provides command-line utilities for processing audio using leopard.
diff --git a/binding/python/README.md b/binding/python/README.md
index 042c511a..bdfc1c50 100644
--- a/binding/python/README.md
+++ b/binding/python/README.md
@@ -41,16 +41,16 @@ import pvleopard

leopard = pvleopard.create(access_key='${ACCESS_KEY}')

-transcript, words = leopard.process_file('${AUDIO_PATH}')
+transcript, words = leopard.process_file('${AUDIO_FILE_PATH}')
print(transcript)
for word in words:
    print(
-        "{word=\"%s\" start_sec=%.2f end_sec=%.2f confidence=%.2f}"
-        % (word.word, word.start_sec, word.end_sec, word.confidence))
+        "{word=\"%s\" start_sec=%.2f end_sec=%.2f confidence=%.2f speaker_tag=%d}"
+        % (word.word, word.start_sec, word.end_sec, word.confidence, word.speaker_tag))
```

Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/) and
-`${AUDIO_PATH}` to the path an audio file.
+`${AUDIO_FILE_PATH}` with the path to an audio file.
Finally, when done be sure to explicitly release the resources:
```python
leopard.delete()
```

@@ -69,9 +69,18 @@ Pass in the `.pv` file via the `model_path` argument:
```python
leopard = pvleopard.create(
    access_key='${ACCESS_KEY}',
-    model_path='${MODEL_PATH}')
+    model_path='${MODEL_FILE_PATH}')
```

+### Word Metadata
+
+Along with the transcript, Leopard returns metadata for each transcribed word. Available metadata items are:
+
+- **Start Time:** Indicates when the word started in the transcribed audio. Value is in seconds.
+- **End Time:** Indicates when the word ended in the transcribed audio. Value is in seconds.
+- **Confidence:** Leopard's confidence that the transcribed word is accurate. It is a number within `[0, 1]`.
+- **Speaker Tag:** If speaker diarization is enabled on initialization, the speaker tag is a non-negative integer identifying unique speakers, with `0` reserved for unknown speakers. If speaker diarization is not enabled, the value will always be `-1`.
+
## Demos

[pvleoparddemo](https://pypi.org/project/pvleoparddemo/) provides command-line utilities for processing audio using
diff --git a/binding/react-native/README.md b/binding/react-native/README.md
index 86be8e37..4a6a7d7d 100644
--- a/binding/react-native/README.md
+++ b/binding/react-native/README.md
@@ -50,18 +50,6 @@ Leopard requires a valid Picovoice `AccessKey` at initialization. 
`AccessKey` ac You can get your `AccessKey` for free. Make sure to keep your `AccessKey` secret. Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get your `AccessKey`. -## Adding Leopard Models - -Create a custom model using the [Picovoice Console](https://console.picovoice.ai/) or use one of the default language models found in [lib/common](../../lib/common). - -### Android - -To add a Leopard model file to your Android application, add the file as a bundled resource by placing it under the `assets` directory of your Android application. - -### iOS - -To add a Leopard model file to your iOS application, add the file as a bundled resource by selecting Build Phases in `Xcode` and adding it to the `Copy Bundle Resources` step. - ## Usage Create an instance of `Leopard`: @@ -95,6 +83,27 @@ try { Finally, when done be sure to explicitly release the resources using `leopard.delete()`. +### Language Model + +Create a custom model using the [Picovoice Console](https://console.picovoice.ai/) or use one of the default language models found in [lib/common](../../lib/common). + +#### Adding to Android + +To add a Leopard model file to your Android application, add the file as a bundled resource by placing it under the `assets` directory of your Android application. + +#### Adding to iOS + +To add a Leopard model file to your iOS application, add the file as a bundled resource by selecting Build Phases in `Xcode` and adding it to the `Copy Bundle Resources` step. + +### Word Metadata + +Along with the transcript, Leopard returns metadata for each transcribed word. Available metadata items are: + +- **Start Time:** Indicates when the word started in the transcribed audio. Value is in seconds. +- **End Time:** Indicates when the word ended in the transcribed audio. Value is in seconds. +- **Confidence:** Leopard's confidence that the transcribed word is accurate. It is a number within `[0, 1]`. +- **Speaker Tag:** If speaker diarization is enabled on initialization, the speaker tag is a non-negative integer identifying unique speakers, with `0` reserved for unknown speakers. If speaker diarization is not enabled, the value will always be `-1`. + ## Demo App For example usage refer to our [React Native demo application](https://github.com/Picovoice/leopard/tree/master/demo/react-native). diff --git a/binding/react/README.md b/binding/react/README.md index c4ec3133..30087952 100644 --- a/binding/react/README.md +++ b/binding/react/README.md @@ -97,12 +97,13 @@ const leopardModel = { } ``` -Additional engine options are provided via the `options` parameter. Set `enableAutomaticPunctuation` to true if you wish to enable punctuation in the transcript. +Additional engine options are provided via the `options` parameter. Set `enableAutomaticPunctuation` to true if you wish to enable punctuation in the transcript or `enableDiarization` to true if you wish to enable speaker diarization. ```typescript // Optional const options = { - enableAutomaticPunctuation: true + enableAutomaticPunctuation: true, + enableDiarization: true } ``` @@ -133,7 +134,7 @@ const initLeopard = async () => { } ``` -In case of any errors, use the `error` state variable to check the error message. Use the `isLoaded` state variable to check if `Leopard` has loaded. +In case of any errors, use the `error` state variable to check the error message. Use the `isLoaded` state variable to check if `Leopard` has loaded. 
### Transcribe Audio

@@ -170,7 +171,7 @@ If `WebVoiceProcessor` has started correctly, `isRecording` will be set to true.
**Note**: By default, Leopard will only record for 2 minutes before stopping and processing the buffered audio. This is to prevent unbounded memory usage. To increase this limit, call `startRecording` with the optional `maxRecordingSec` parameter:

```typescript
-const maxRecordingSec = 60 * 10 
+const maxRecordingSec = 60 * 10
await startRecording(maxRecordingSec)
```

@@ -195,8 +196,12 @@ useEffect(() => {
}, [result])
```

-- `transcript`: A string containing the transcribed data.
-- `words`: A list of objects containing a `word`, `startSec`, `endSec`, and `confidence`. Each object indicates the start, end time and confidence (between 0 and 1) of the word.
+Along with the transcript, Leopard returns metadata for each transcribed word. Available metadata items are:
+
+- **Start Time:** Indicates when the word started in the transcribed audio. Value is in seconds.
+- **End Time:** Indicates when the word ended in the transcribed audio. Value is in seconds.
+- **Confidence:** Leopard's confidence that the transcribed word is accurate. It is a number within `[0, 1]`.
+- **Speaker Tag:** If speaker diarization is enabled on initialization, the speaker tag is a non-negative integer identifying unique speakers, with `0` reserved for unknown speakers. If speaker diarization is not enabled, the value will always be `-1`.

### Release

diff --git a/binding/rust/README.md b/binding/rust/README.md
index 50f1fbef..2dc8f5c5 100644
--- a/binding/rust/README.md
+++ b/binding/rust/README.md
@@ -53,15 +53,18 @@ use leopard::LeopardBuilder;

fn main() {
    let access_key = "${ACCESS_KEY}"; // AccessKey obtained from Picovoice Console (https://console.picovoice.ai/)
-    let leopard: Leopard = LeopardBuilder::new().access_key(access_key).init().expect("Unable to create Leopard");
-    if let Ok(leopard_transcript) = leopard.process_file("${AUDIO_PATH}") {
+    let leopard: Leopard = LeopardBuilder::new()
+        .access_key(access_key)
+        .init()
+        .expect("Unable to create Leopard");
+    if let Ok(leopard_transcript) = leopard.process_file("${AUDIO_FILE_PATH}") {
        println!("{}", leopard_transcript.transcript);
    }
}
```

Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/) and
-`${AUDIO_PATH}` to the path an audio file.
+`${AUDIO_FILE_PATH}` with the path to an audio file.

The model file contains the parameters for the Leopard engine. You may create bespoke language models using [Picovoice Console](https://console.picovoice.ai/) and then pass in the relevant file.

@@ -77,11 +80,20 @@ Pass in the `.pv` file via the `.model_path()` Builder argument:

```rust
let leopard: Leopard = LeopardBuilder::new()
    .access_key("${ACCESS_KEY}")
-    .model_path("${MODEL_PATH}")
+    .model_path("${MODEL_FILE_PATH}")
    .init()
    .expect("Unable to create Leopard");
```

+### Word Metadata
+
+Along with the transcript, Leopard returns metadata for each transcribed word. Available metadata items are:
+
+- **Start Time:** Indicates when the word started in the transcribed audio. Value is in seconds.
+- **End Time:** Indicates when the word ended in the transcribed audio. Value is in seconds.
+- **Confidence:** Leopard's confidence that the transcribed word is accurate. It is a number within `[0, 1]`.
+- **Speaker Tag:** If speaker diarization is enabled on initialization, the speaker tag is a non-negative integer identifying unique speakers, with `0` reserved for unknown speakers. 
If speaker diarization is not enabled, the value will always be `-1`. + ## Demos The [Leopard Rust demo project](https://github.com/Picovoice/leopard/tree/master/demo/rust) is a Rust console app that allows for processing real-time audio (i.e. microphone) and files using Leopard. diff --git a/binding/web/README.md b/binding/web/README.md index c97d062e..046455cf 100644 --- a/binding/web/README.md +++ b/binding/web/README.md @@ -40,19 +40,19 @@ or using `npm`: npm install --save @picovoice/leopard-web ``` -### AccessKey +## AccessKey Leopard requires a valid Picovoice `AccessKey` at initialization. `AccessKey` acts as your credentials when using Leopard SDKs. You can get your `AccessKey` for free. Make sure to keep your `AccessKey` secret. Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get your `AccessKey`. -### Usage +## Usage Create a model in [Picovoice Console](https://console.picovoice.ai/) or use one of the default language models found in [lib/common](../../lib/common). For the web packages, there are two methods to initialize Leopard. -#### Public Directory +### Public Directory **NOTE**: Due to modern browser limitations of using a file URL, this method does __not__ work if used without hosting a server. @@ -62,7 +62,7 @@ This method fetches the model file from the public directory and feeds it to Leo cp ${LEOPARD_MODEL_FILE} ${PATH_TO_PUBLIC_DIRECTORY} ``` -#### Base64 +### Base64 **NOTE**: This method works without hosting a server, but increases the size of the model file roughly by 33%. @@ -80,7 +80,7 @@ run: npx pvbase64 -h ``` -#### Leopard Model +### Language Model Leopard saves and caches your model file in IndexedDB to be used by WebAssembly. Use a different `customWritePath` variable to hold multiple models and set the `forceWrite` value to true to force re-save a model file. @@ -100,44 +100,42 @@ const leopardModel = { } ``` -#### Init options - -Set `enableAutomaticPunctuation` to true, if you wish to enable punctuation in transcript. - -```typescript -// Optional -const options = { - enableAutomaticPunctuation: true -} -``` - -#### Initialize Leopard +### Initialize Leopard Create an instance of `Leopard` in the main thread: ```typescript -const handle = await Leopard.create( +const leopard = await Leopard.create( ${ACCESS_KEY}, leopardModel, - options // optional options + options ); ``` Or create an instance of `Leopard` in a worker thread: ```typescript -const handle = await LeopardWorker.create( +const leopard = await LeopardWorker.create( ${ACCESS_KEY}, leopardModel, - options // optional options + options ); ``` -#### Process Audio Frames +Additional configuration options can be passed to `create`. Set `enableAutomaticPunctuation` to true if you wish to enable punctuation in transcript or `enableDiarization` if you wish to enable speaker diarization. + +```typescript +const options = { + enableAutomaticPunctuation: true, + enableDiarization: true +} +``` + +### Process Audio Frames The process result is an object with: - `transcript`: A string containing the transcribed data. -- `words`: A list of objects containing a `word`, `startSec`, `endSec`, and `confidence`. Each object indicates the start, end time and confidence (between 0 and 1) of the word. +- `words`: A list of objects containing a `word`, `startSec`, `endSec`, `confidence` and `speakerTag`. 
```typescript function getAudioData(): Int16Array { @@ -145,7 +143,7 @@ function getAudioData(): Int16Array { return new Int16Array(); } -const result = await handle.process(getAudioData()); +const result = await leopard.process(getAudioData()); console.log(result.transcript); console.log(result.words); ``` @@ -154,7 +152,7 @@ For processing using worker, you may consider transferring the buffer instead fo ```typescript let pcm = new Int16Array(); -const result = await handle.process(pcm, { +const result = await leopard.process(pcm, { transfer: true, transferCallback: (data) => { pcm = data } }); @@ -162,22 +160,29 @@ console.log(result.transcript); console.log(result.words); ``` -#### Clean Up +### Clean Up Clean up used resources by `Leopard` or `LeopardWorker`: ```typescript -await handle.release(); +await leopard.release(); ``` -#### Terminate - Terminate `LeopardWorker` instance: ```typescript -await handle.terminate(); +await leopard.terminate(); ``` +### Word Metadata + +Along with the transcript, Leopard returns metadata for each transcribed word. Available metadata items are: + +- **Start Time:** Indicates when the word started in the transcribed audio. Value is in seconds. +- **End Time:** Indicates when the word ended in the transcribed audio. Value is in seconds. +- **Confidence:** Leopard's confidence that the transcribed word is accurate. It is a number within `[0, 1]`. +- **Speaker Tag:** If speaker diarization is enabled on initialization, the speaker tag is a non-negative integer identifying unique speakers, with `0` reserved for unknown speakers. If speaker diarization is not enabled, the value will always be `-1`. + ## Demo For example usage refer to our [Web demo application](https://github.com/Picovoice/leopard/tree/master/demo/web). diff --git a/demo/c/README.md b/demo/c/README.md index b172de1f..38d239a7 100644 --- a/demo/c/README.md +++ b/demo/c/README.md @@ -44,7 +44,7 @@ usage: -a ACCESS_KEY -l LIBRARY_PATH -m MODEL_PATH [-d] [-v] audio_path0 audio_p ``` Run the command corresponding to your platform from the root of the repository. Replace `${ACCESS_KEY}` with yours -obtained from [Picovoice Console](https://console.picovoice.ai/) and `${AUDIO_PATH}` with the path to an audio file you +obtained from [Picovoice Console](https://console.picovoice.ai/) and `${AUDIO_FILE_PATH}` with the path to an audio file you want to transcribe. Use the `-d` flag to disable automatic punctuation. @@ -59,7 +59,7 @@ Use the `-v` flag to enable the printing of word metadata. 
-a ${ACCESS_KEY} \ -m lib/common/leopard_params.pv \ -l lib/linux/x86_64/libpv_leopard.so \ -${AUDIO_PATH} +${AUDIO_FILE_PATH} ``` #### macOS (x86_64) @@ -69,7 +69,7 @@ ${AUDIO_PATH} -a ${ACCESS_KEY} \ -m lib/common/leopard_params.pv \ -l lib/mac/x86_64/libpv_leopard.dylib \ -${AUDIO_PATH} +${AUDIO_FILE_PATH} ``` #### macOS (arm64) @@ -79,7 +79,7 @@ ${AUDIO_PATH} -a ${ACCESS_KEY} \ -m lib/common/leopard_params.pv \ -l lib/mac/arm64/libpv_leopard.dylib \ -${AUDIO_PATH} +${AUDIO_FILE_PATH} ``` #### Windows @@ -91,7 +91,7 @@ demo\\c\\build\\leopard_demo.exe ^ -a ${ACCESS_KEY} ^ -m lib\\common\\leopard_params.pv ^ -l lib\\windows\\amd64\\libpv_leopard.dll ^ -${AUDIO_PATH} +${AUDIO_FILE_PATH} ``` #### Raspberry Pi 4 @@ -101,7 +101,7 @@ ${AUDIO_PATH} -a ${ACCESS_KEY} \ -m lib/common/leopard_params.pv \ -l lib/raspberry-pi/cortex-a72/libpv_leopard.so \ -${AUDIO_PATH} +${AUDIO_FILE_PATH} ``` #### Raspberry Pi 4 (64-bit) @@ -111,7 +111,7 @@ ${AUDIO_PATH} -a ${ACCESS_KEY} \ -m lib/common/leopard_params.pv \ -l lib/raspberry-pi/cortex-a72-aarch64/libpv_leopard.so \ -${AUDIO_PATH} +${AUDIO_FILE_PATH} ``` #### Raspberry Pi 3 @@ -121,7 +121,7 @@ ${AUDIO_PATH} -a ${ACCESS_KEY} \ -m lib/common/leopard_params.pv \ -l lib/raspberry-pi/cortex-a53/libpv_leopard.so \ -${AUDIO_PATH} +${AUDIO_FILE_PATH} ``` #### Raspberry Pi 3 (64-bit) @@ -131,7 +131,7 @@ ${AUDIO_PATH} -a ${ACCESS_KEY} \ -m lib/common/leopard_params.pv \ -l lib/raspberry-pi/cortex-a53-aarch64/libpv_leopard.so \ -${AUDIO_PATH} +${AUDIO_FILE_PATH} ``` #### NVIDIA Jetson Nano @@ -141,5 +141,5 @@ ${AUDIO_PATH} -a ${ACCESS_KEY} \ -m lib/common/leopard_params.pv \ -l lib/jetson/cortex-a57-aarch64/libpv_leopard.so \ -${AUDIO_PATH} +${AUDIO_FILE_PATH} ``` diff --git a/demo/dotnet/README.md b/demo/dotnet/README.md index 526841ce..02a7315c 100644 --- a/demo/dotnet/README.md +++ b/demo/dotnet/README.md @@ -62,12 +62,12 @@ Leopard/demo/dotnet/LeopardDemo Run the following in the terminal: ```console -dotnet run -c FileDemo.Release -- \ ---input_audio_path ${AUDIO_PATH} \ +dotnet run -c FileDemo.Release -- \ +--input_audio_path ${AUDIO_FILE_PATH} \ --access_key ${ACCESS_KEY} \ ``` -Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_PATH}` with a path to an audio file you +Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_FILE_PATH}` with a path to an audio file you wish to transcribe. ### Microphone Demo @@ -91,9 +91,9 @@ It provides information about various audio input devices on the box. This is an ``` index: 0, device name: USB Audio Device index: 1, device name: MacBook Air Microphone -``` +``` -You can use the device index to specify which microphone to use for the demo. For instance, if you want to use the USB Audio Device +You can use the device index to specify which microphone to use for the demo. For instance, if you want to use the USB Audio Device in the above example, you can invoke the demo application as below: ```console diff --git a/demo/go-grpc/README.md b/demo/go-grpc/README.md index 7efb04ab..ed2f30dc 100644 --- a/demo/go-grpc/README.md +++ b/demo/go-grpc/README.md @@ -31,7 +31,7 @@ While the server is up, make a transcription request: ```console go run client.go \ ---input_audio "${AUDIO_PATH}" +--input_audio "${AUDIO_FILE_PATH}" `````` -Replace `${AUDIO_PATH}` with a path to an audio file you wish to transcribe. \ No newline at end of file +Replace `${AUDIO_FILE_PATH}` with a path to an audio file you wish to transcribe. 
\ No newline at end of file diff --git a/demo/go/README.md b/demo/go/README.md index da153aaa..461c4388 100644 --- a/demo/go/README.md +++ b/demo/go/README.md @@ -6,7 +6,7 @@ Made in Vancouver, Canada by [Picovoice](https://picovoice.ai) Leopard is an on-device speech-to-text engine. Leopard is: -- Private; All voice processing runs locally. +- Private; All voice processing runs locally. - Accurate [[1]](https://github.com/Picovoice/speech-to-text-benchmark#results) - Compact and Computationally-Efficient [[2]](https://github.com/Picovoice/speech-to-text-benchmark#rtf) - Cross-Platform: @@ -40,11 +40,11 @@ Run the following in the terminal: ```console go run filedemo/leopard_file_demo.go \ --input_audio_path "${AUDIO_PATH}" \ +-input_audio_path "${AUDIO_FILE_PATH}" \ -access_key "${ACCESS_KEY}" ``` -Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_PATH}` with a path to an audio file you +Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_FILE_PATH}` with a path to an audio file you wish to transcribe. ### Microphone Demo @@ -75,7 +75,7 @@ It provides information about various audio input devices on the box. Here is an ```console index: 0, device name: USB Audio Device index: 1, device name: MacBook Air Microphone -``` +``` You can use the device index to specify which microphone to use for the demo. For instance, if you want to use the USB Audio Device in the above example, you can invoke the demo application as below: diff --git a/demo/java/README.md b/demo/java/README.md index 95302033..b12c8012 100644 --- a/demo/java/README.md +++ b/demo/java/README.md @@ -49,7 +49,7 @@ cd leopard/demo/java/build/libs The file demo uses Leopard to get speech-to-text results from an audio file. ```console -java -jar leopard-file-demo.jar -a ${ACCESS_KEY} -i ${AUDIO_PATH} +java -jar leopard-file-demo.jar -a ${ACCESS_KEY} -i ${AUDIO_FILE_PATH} ``` ### Microphone Demo diff --git a/demo/nodejs/README.md b/demo/nodejs/README.md index 1c9bb85b..2ae7ee4a 100644 --- a/demo/nodejs/README.md +++ b/demo/nodejs/README.md @@ -39,10 +39,10 @@ Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get you Run the following in the terminal: ```console -leopard-file-demo --access_key ${ACCESS_KEY} --input_audio_file_path ${AUDIO_PATH} +leopard-file-demo --access_key ${ACCESS_KEY} --input_audio_file_path ${AUDIO_FILE_PATH} ``` -Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_PATH}` with a path to an audio file you +Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_FILE_PATH}` with a path to an audio file you wish to transcribe. ### Microphone Demo diff --git a/demo/python-subtitle/README.md b/demo/python-subtitle/README.md index d62c2eb0..e7396a47 100644 --- a/demo/python-subtitle/README.md +++ b/demo/python-subtitle/README.md @@ -32,7 +32,7 @@ or any audio file: ```console python3 demo/python-subtitle/main.py \ --access-key ${ACCESS_KEY} \ ---audio-path ${AUDIO_PATH} \ +--audio-path ${AUDIO_FILE_PATH} \ --subtitle-path ${SUBTITLE_PATH} ``` diff --git a/demo/python/README.md b/demo/python/README.md index 6b6e0f0a..28295ea8 100644 --- a/demo/python/README.md +++ b/demo/python/README.md @@ -6,7 +6,7 @@ Made in Vancouver, Canada by [Picovoice](https://picovoice.ai) Leopard is an on-device speech-to-text engine. Leopard is: -- Private; All voice processing runs locally. +- Private; All voice processing runs locally. 
- [Accurate](https://picovoice.ai/docs/benchmark/stt/) - [Compact and Computationally-Efficient](https://github.com/Picovoice/speech-to-text-benchmark#rtf) - Cross-Platform: @@ -39,10 +39,10 @@ Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get you Run the following in the terminal: ```console -leopard_demo_file --access_key ${ACCESS_KEY} --audio_paths ${AUDIO_PATH} +leopard_demo_file --access_key ${ACCESS_KEY} --audio_paths ${AUDIO_FILE_PATH} ``` -Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_PATH}` with a path to an audio file you +Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_FILE_PATH}` with a path to an audio file you wish to transcribe. ### Microphone Demo @@ -56,7 +56,7 @@ leopard_demo_mic --access_key ${ACCESS_KEY} Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console. Once running, the demo prints: ```console ->>> Press `ENTER` to start: +>>> Press `ENTER` to start: ``` Press `ENTER` key and wait for the following message in the terminal: diff --git a/demo/rust/README.md b/demo/rust/README.md index ebec8753..a5b9d52a 100644 --- a/demo/rust/README.md +++ b/demo/rust/README.md @@ -57,10 +57,10 @@ leopard/demo/rust/micdemo # Microphone Demo Run the following in the terminal: ```console -cargo run --release -- --access_key ${ACCESS_KEY} --input_audio_path ${AUDIO_PATH} +cargo run --release -- --access_key ${ACCESS_KEY} --input_audio_path ${AUDIO_FILE_PATH} ``` -Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_PATH}` with a path to an audio file you +Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${AUDIO_FILE_PATH}` with a path to an audio file you wish to transcribe. ### Microphone Demo