SHAREVOX core の API を探る（３）OpenAL

Explore the SHAREVOX core API (3) OpenAL

2022/12/21
藤田昭人

昨日の「マクラでモタつくとその日のうちに書き上げるのは難しい」を教訓に…

毎年、僕を講義に呼んでくれる吉田さんが先日の僕が担当した講義の様子をブログ（ https://ndsi.kyo2.jp/d2022-12-18.html ）で紹介してくれました。でも、この「音声チャットボット」について僕が語り始めるとまたまた収まらなくなるので…

昨日の t2s の続きを進めます。

sharevox_tts が生成するデータ

昨日の記事でも書きましたが、せっかく対話インターフェースをつけたのに読み上げてくれないと興醒めなので、 WAVデータを再生する機能が欲しいところ。

そこで、まずはWAVデータを生成する sharevox_tts の出力を見てみました。次のソースコード（抜粋）の箇所です。

  result = sharevox_tts(text.c_str(), speaker_id, &output_binary_size, &output_wav);
  if (result != SHAREVOX_RESULT_SUCCEED) {
    std::cout << sharevox_error_result_to_message(result) << std::endl;
    return 1;
  }

  std::cout << "音声ファイル保存中..." << std::endl;

  std::ofstream wav_file(OUTPUT_WAV_NAME, std::ios::binary);
  wav_file.write(reinterpret_cast<const char*>(output_wav), output_binary_size);
  sharevox_wav_free(output_wav);

コードからもわかるように、どうやら（ヘッダーを含む） WAVデータが丸々メモリー上に格納されているようです。で、音声データを再生する類似の実装を幾つかチェックしてみたのですが、データを一旦ファイルに格納し音声データを再生するコマンド（macOSの場合はafplayとか）を起動する実装が一般的でした。なんかイマイチなやり方ですねぇ…

WAVファイルの再生方法

そこでオンメモリのWAVデータを（ファイルに格納する事なく）再生できるオープンソースを探してみました。いずれもWAVファイルを再生するツールだったのですが、オンメモリ・データの再生に改造しやすそうな次のサンプルを見つけました。

github.com

このプログラムには元ネタがあるようです*1。

WAVファイルのヘッダーの情報などがきっちり定義されていて、最初の入力ファイルの読み込み部分さえ外せば上手くいきそう…と思ったのですが、作業してみたらちょっと上手くないところがありました。これは後述の「play.cpp」のところで説明します。

OpenAL

上記のサンプルは OpenAL なるライブラリの利用を仮定しています。
例によって、詳細は Wikipedia から…

ja.wikipedia.org

どうやら、ゲームプログラミングでは著名なライブラリのようです。オリジナルを開発した Loki Software は既に廃業しているようで、今は次の団体が引き継いでるとか…

www.openal.org

でも、オープンソース版は次のサイトからダウンロードできるようです*2。

openal-soft.org

MacBook を使っている僕の場合、例によってホームブリューでインストールするのが簡単ですね。

formulae.brew.sh

t2s を喋らせる実装

今回は昨日の記事に対するアップデートです。

CMakeLists.txt

CMakeLists.txt の修正箇所は次のとおりです。

$ diff -u CMakeLists.txt.orig CMakeLists.txt
--- CMakeLists.txt.orig 2022-12-21 10:42:15.000000000 +0900
+++ CMakeLists.txt  2022-12-10 23:26:14.000000000 +0900
@@ -1,6 +1,8 @@
 cmake_minimum_required(VERSION 3.16)
 project(T2S)
 set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_FLAGS "-Wno-deprecated-declarations -Wno-writable-strings")
+message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")

 find_path(PARENT NAMES "sharevox_core-0.1.2" PATHS "../..")
 message(STATUS "PARENT: ${PARENT}")
@@ -8,6 +10,9 @@
 message(STATUS "TOP_DIR: ${TOP_DIR}")
 find_library(CORE_LIB NAMES core PATHS "${TOP_DIR}/core" PATH_SUFFIXES lib)
 message(STATUS "CORE_LIB: ${CORE_LIB}")
+set(OPENAL_DIR "/usr/local/opt/openal-soft")
+message(STATUS "OPENAL_DIR: ${OPENAL_DIR}")

-add_executable(t2s t2s.cpp)
-target_link_libraries(t2s ${CORE_LIB} readline)
+link_directories("${OPENAL_DIR}/lib")
+add_executable(t2s t2s.cpp play.cpp)
+target_link_libraries(t2s ${CORE_LIB} readline openal)
$

C++ コンパイラ（Clang）の警告を抑止するコンパイル・オプションを追加し、 OpenAL のライブラリもリンク対象に含めました。

t2s.cpp

t2s.cpp は内部コマンドを幾つか追加しました。

$ diff -u t2s.cpp.orig t2s.cpp
--- t2s.cpp.orig    2022-12-21 10:42:34.000000000 +0900
+++ t2s.cpp 2022-12-11 23:36:11.000000000 +0900
@@ -2,26 +2,28 @@
 #include <string.h>
 #include <stdlib.h>

-#include "../core/src/core.h"
+extern void play(uint8_t *output, int size);

-#define    MODEL       "../../model"
-#define    OPENJTALK_DIC   "../open_jtalk_dic_utf_8-1.11"
-#define    OUTPUT_WAV_NAME "audio.wav"
+
+#include "../core/src/core.h"

 void
-init()
+init(const char *model, const char *dic)
 {
   SharevoxResultCode result;

-  if (!initialize(MODEL, false)) {
+  printf("model: %s\n", model);
+  if (!initialize(model, false)) {
     printf("coreの初期化に失敗しました\n");
     exit(1);
   } else {
     printf("coreの初期化に成功しました\n");
   }

-  printf("openjtalk辞書の読み込み中...");
-  result = sharevox_load_openjtalk_dict(OPENJTALK_DIC);
+  //printf("metas: %s\n", metas());
+
+  printf("dic: %s\n", dic);
+  result = sharevox_load_openjtalk_dict(dic);
   if (result != SHAREVOX_RESULT_SUCCEED) {
     printf("\n%s\n", sharevox_error_result_to_message(result));
     exit(1);
@@ -75,6 +77,27 @@
 }


+// wave format
+
+void
+show(uint8_t *wav, int wsiz)
+{
+  printf("%d bytes\n\n", wsiz);
+  printf("RIFF: %c%c%c%c\n", wav[0], wav[1], wav[2], wav[3]);
+  printf("size: %d\n",            *((int32_t *) &wav[4]));
+  printf("WAVE: %c%c%c%c\n", wav[8], wav[9], wav[10], wav[11]);
+  printf(" fmt: %c%c%c%c\n", wav[12], wav[13], wav[14], wav[15]);
+  printf("wFormatLength: %d\n",   *((int32_t *) &wav[16]));
+  printf("wFormatTag: %d\n",      *((int16_t *) &wav[20]));
+  printf("nChannels: %d\n",       *((int16_t *) &wav[22]));
+  printf("nSamplesPerSec: %d\n",  *((int32_t *) &wav[24]));
+  printf("nAvgBytesPerSec: %d\n", *((int32_t *) &wav[28]));
+  printf("nBlockAlign: %d\n",     *((int16_t *) &wav[32]));
+  printf("wBitsPerSample: %d\n",  *((int16_t *) &wav[34]));
+  printf("ov_data: %c%c%c%c\n", wav[36], wav[37], wav[38], wav[39]);
+  printf("ov_datasize: %d\n",     *((int32_t *) &wav[40]));
+}
+
 // readline

 #include <readline/readline.h>
@@ -92,8 +115,12 @@
   return(-1);
 }

+#define    MODEL       "../../model"
+#define    OPENJTALK_DIC   "../open_jtalk_dic_utf_8-1.11"
+#define    OUTPUT_WAV_NAME "audio.wav"
+
 int
-main()
+main(int argc, char *argv[])
 {
   char *line = NULL;
   int n;
@@ -101,9 +128,16 @@
   SharevoxResultCode result;
   int64_t speaker_id = 0;
   int output_binary_size = 0;
-  uint8_t *output_wav = nullptr;
+  uint8_t *output_wav = NULL;

-  init();
+  char *model = MODEL;
+  char *dic = OPENJTALK_DIC;
+
+  if (argc > 1) {
+    model = argv[1];
+  }
+
+  init(model, dic);

   while (1) {
     line = readline("> ");
@@ -120,17 +154,29 @@
             break;
       }
     }
+
     if (n == 3) {
-      printf("[ja] %s\n", line);
       if (generate(line,
                    speaker_id,
                    &output_binary_size,
                    &output_wav) < 0) exit(1);
-      if (save(OUTPUT_WAV_NAME, output_wav, output_binary_size) < 0) exit(1);
-      sharevox_wav_free(output_wav);
+      if (output_wav != NULL) play(output_wav, output_binary_size);
+      //printf("# %d bytes\n", output_binary_size);
+    } else if (n == 1) {
+      if (strcmp(line, "save") == 0) {
+        if (save(OUTPUT_WAV_NAME, output_wav, output_binary_size) < 0) exit(1);
+        sharevox_wav_free(output_wav);
+         output_wav = NULL;
+      } else if (strcmp(line, "show") == 0) {
+        if (output_wav != NULL) show(output_wav, output_binary_size);
+      } else if (strcmp(line, "p") == 0 ||
+        strcmp(line, "play") == 0) {
+   if (output_wav != NULL) play(output_wav, output_binary_size);
+      }
     } else {
       printf("[%d] %s\n", n, line);
     }
+
     add_history(line);
     free(line);
   }
$

追加した内部コマンドについては後述します。

play.cpp

さて、本稿のホットポイントの play.cpp です。関数 play はOpenAL を使って WAVデータを再生する関数です。関数の前半では WAVヘッダーから情報を取り出し、後半では OpenAL で再生しています。

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>

#include <OpenAL/al.h>
#include <OpenAL/alc.h>

// Constants for the canonical PCM WAV layout handled by this file.
// Only the channel-count values are meaningful to getFormat(); the other
// macros are not referenced by the code in this file.

// Number of header bytes that precede the sample data (matches the fields of wav_header).
#define WAVH_HEADER_SIZE 44
// Values of the nChannels header field.
#define WAVH_CHANNELS_MONO 1
#define WAVH_CHANNELS_STEREO 2
// CD-quality sampling rate in Hz.
#define WAVH_SAMPLINGRATE_CD 44100
// Values of the wBitsPerSample header field.
#define  WAVH_BITSPERSAMPLE8 8
#define  WAVH_BITSPERSAMPLE16 16

/**
 * Map a WAV channel count and bit depth to the matching OpenAL buffer format.
 *
 * @param wavChannels  channel count from the WAV header
 *                     (WAVH_CHANNELS_MONO or WAVH_CHANNELS_STEREO)
 * @param wavBit       bits per sample from the WAV header
 *                     (WAVH_BITSPERSAMPLE8 or WAVH_BITSPERSAMPLE16)
 * @return AL_FORMAT_MONO8, AL_FORMAT_MONO16, AL_FORMAT_STEREO8 or
 *         AL_FORMAT_STEREO16; AL_NONE when the combination is not supported.
 *         (Previously an unsupported combination returned an uninitialized
 *         value, which is undefined behaviour.)
 */
ALenum getFormat(int wavChannels,  int wavBit)
{
    switch (wavChannels) {
    case WAVH_CHANNELS_MONO:
        if (wavBit == WAVH_BITSPERSAMPLE8)  return AL_FORMAT_MONO8;
        if (wavBit == WAVH_BITSPERSAMPLE16) return AL_FORMAT_MONO16;
        break;
    case WAVH_CHANNELS_STEREO:
        if (wavBit == WAVH_BITSPERSAMPLE8)  return AL_FORMAT_STEREO8;
        if (wavBit == WAVH_BITSPERSAMPLE16) return AL_FORMAT_STEREO16;
        break;
    default:
        break;
    }
    // Unsupported channel count or bit depth: report it explicitly so the
    // caller (or alBufferData) can reject it instead of receiving garbage.
    return AL_NONE;
}

// In-memory view of a canonical 44-byte PCM WAV header followed by the samples.
// play() casts the raw WAV buffer to this type, so the field order and sizes
// must match the file layout exactly.
// NOTE(review): assumes 4-byte int, 2-byte short, no padding and a
// little-endian host (the WAVH_* tag constants are little-endian) - confirm
// before porting to another platform.
typedef struct tag_wav_header {   
    int RIFF;                // chunk tag 'R','I','F','F' (compared against WAVH_RIFF)
    int size;                   // size of the wave file from here on
    int WAVE;             // format tag 'W','A','V','E' (compared against WAVH_WAVE)
    int fmt;            // sub-chunk tag 'f','m','t',' ' (compared against WAVH_FMT)
    int   wFormatLength;          // length of the format sub-chunk
    short    wFormatTag;             // should be 1 for PCM data
    short    nChannels;              // 1 for mono, 2 for stereo
    int   nSamplesPerSec;         // sampling rate in Hz, e.g. 11025, 22050, 44100
    int   nAvgBytesPerSec;        // average data rate in bytes per second
    short    nBlockAlign;            // bytes per sample frame (1 for 8-bit mono, 2 for 16-bit mono)
    short    wBitsPerSample;         // 8 for 8-bit data, 16 for 16-bit data
    int ov_data;                // sub-chunk tag 'd','a','t','a' (compared against WAVH_OV_DATA)
    int   ov_datasize;               // size in bytes of the sample data that follows

  // Zero-length array (compiler extension) marking where the sample data starts.
  unsigned char data[0];
} wav_header ;

// Expected header values. The four-character tags are written as the 32-bit
// integer obtained when the four ASCII bytes are read in little-endian order.
const int   WAVH_RIFF = 0x46464952; // "RIFF"
const int   WAVH_WAVE =  0x45564157; // "WAVE"
const int   WAVH_FMT = 0x20746D66; // "fmt " (with trailing space)
const int   WAVH_OV_DATA = 0x61746164;    // "data"
const int   WAVH_WFORMATLENGTH = 16; // format sub-chunk length for plain PCM (not referenced in this file)
const short WAVH_WFORMATTAG_PCM = 1; // wFormatTag value for uncompressed PCM

void
play(uint8_t *output, int size)
{
  wav_header *hp = (wav_header *) output;

  printf("### play\n");
  //printf("RIFF: 0x%x\n", hp->RIFF);
    if (hp->RIFF != WAVH_RIFF) {
        printf("NOT match riff\n");
    return;
  }
    //printf("size: %d\n", hp->size);
  //printf("WAVE: 0x%x\n", hp->WAVE);
    if (hp->WAVE != WAVH_WAVE) {
        printf("NOT match wave\n");
        return;
    }
  //printf(" fmt: 0x%x\n", hp->fmt);
    if (hp->fmt != WAVH_FMT) {
        printf("NOT match fmt\n");
        return;
    }
    //printf("wFormatLength: %d\n", hp->wFormatLength);
    //printf("wFormatTag: %d\n", hp->wFormatTag);
  if (hp->wFormatTag != WAVH_WFORMATTAG_PCM) {
    printf("wFormatTag should be 1\n");
    return;
  }  

    short wavchannels   = hp->nChannels;
    // nSamplesPerSec（サンプリング周波数）と
    // nAvgBytesPerSec（１秒あたりのバイト数）の違いを説明
    printf("hp->nSamplesPerSec: %d \n", hp->nSamplesPerSec);
    printf("hp->nAvgBytesPerSec: %d \n", hp->nAvgBytesPerSec);
    int   samplesPerSec = hp->nSamplesPerSec;
  int   byteParSec    = hp->nAvgBytesPerSec;
    //printf("byteParSec: %d \n", byteParSec);
  short blockAlign    = hp->nBlockAlign;
    //printf("blockAlign: %d \n", blockAlign);
  short bitsParSample = hp->wBitsPerSample;
    //printf("bitsParSample: %d \n", bitsParSample);
    
  printf("ov_datasize: %d\n", hp->ov_datasize);
  //printf("ov_data: 0x%x\n", hp->ov_data);
  if (hp->ov_data != WAVH_OV_DATA) {
        printf("NOT match ov data\n");
      return;
  }

  int wavChannels     = wavchannels;
  int wavBit          = bitsParSample;
  int wavSize         = hp->ov_datasize;
  int wavSamplingrate = samplesPerSec;

  printf("wavChannels: %d \n", wavChannels);
    printf("wavBit: %d \n", wavBit);
    printf("wavSize: %d \n", wavSize);
    printf("wavSamplingrate: %d \n", wavSamplingrate);

  //int time_playback = (float)wavSize / (float)(4*wavSamplingrate);
    int playback_ms = ((float)wavSize / (float)byteParSec) * 1000.0F;
    printf("playback_ms: %d msec \n", playback_ms);
    unsigned char *data = hp->data;

    ALuint source;
    ALuint buffer;
  ALCdevice *device = alcOpenDevice(NULL);
    if (!device) { printf("alcOpenDevice Faild\n"); return; }
    ALCcontext *context = alcCreateContext(device, NULL);
    if (!context) { printf("alcCreateContext Faild\n"); return; }

    alcMakeContextCurrent(context);
    alGenSources (1, &source);
    alGenBuffers(1, &buffer);

    ALenum format = getFormat(wavChannels,  wavBit);
    alBufferData(buffer, format, data, wavSize, wavSamplingrate);
    alSourcei(source, AL_BUFFER, buffer);
    alSourcePlay(source);

    //printf("alSourcePlay \n");
    int time_count; 
    for (time_count = playback_ms; time_count > 0; time_count--) {
        usleep(1000);
    }

    alDeleteBuffers(1, &buffer);
    alDeleteSources(1, &source);
}

OpenAL は OpenGL とよく似たインターフェースなんだそうですが、要は関数 alBufferData でPCMデータ（WAVデータ）をバッファにセットし、関数 alSourcei でそのバッファをソース（音源）に結びつけ、関数 alSourcePlay で音声再生をキックする… デバイスドライバーみたいなコードを書くようです。音声再生自体はハードウェアで実行されるのですが、ソフトウェアから見ればバックグラウンドで実行されているように見えます。この時、ソフトウェアはデータの再生時間を計算してウェイトループに入ります。再生時間が過ぎると関数 alDeleteBuffers と alDeleteSources をコールしてバッファとソースを解放し、音声再生を終了します。

で、前述のオリジナル・サンプルの問題点ですが、ひとつは nSamplesPerSec（サンプリング周波数）とnAvgBytesPerSec（１秒あたりのバイト数）の意味を取り違えてるように思える事、もうひとつは再生時間の単位を１秒にしている事です。そのためオリジナル・サンプルのコードのままだと WAVデータを最後まで再生せずに処理を終了してしまいます。（つまり尻切れトンボ状態になります）そこで再生待ちのループをミリ秒単位に変更しました。音声の場合、ミリ秒ぐらいまで人間は聞き分けてしまいますからね。

ビルドと実行

ビルドの手順は昨日の記事と変わりません。

$ cd sharevox_core-0.1.2/t2s/
$ ls
CMakeLists.txt          play.cpp
model               t2s.cpp
open_jtalk_dic_utf_8-1.11
$ rm -rf build
$ mkdir build
$ cd build
$ cmake ..
-- The C compiler identification is AppleClang 13.0.0.13000029
-- The CXX compiler identification is AppleClang 13.0.0.13000029
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- CMAKE_CXX_FLAGS: -Wno-deprecated-declarations -Wno-writable-strings
-- PARENT: /Users/fujita/xtr/BookBot/BookBot3/05_T2S_OpenAL
-- TOP_DIR: /Users/fujita/xtr/BookBot/BookBot3/05_T2S_OpenAL/sharevox_core-0.1.2
-- CORE_LIB: /Users/fujita/xtr/BookBot/BookBot3/05_T2S_OpenAL/sharevox_core-0.1.2/core/lib/libcore.dylib
-- OPENAL_DIR: /usr/local/opt/openal-soft
-- Configuring done
-- Generating done
-- Build files have been written to: /Users/fujita/xtr/BookBot/BookBot3/05_T2S_OpenAL/sharevox_core-0.1.2/t2s/build
maverick:build fujita$ cmake --build .
[ 33%] Building CXX object CMakeFiles/t2s.dir/t2s.cpp.o
[ 66%] Building CXX object CMakeFiles/t2s.dir/play.cpp.o
[100%] Linking CXX executable t2s
[100%] Built target t2s
$

但し、t2s ディレクトリには SHAREVOX の公式リリースに添付されている model ディレクトリも配置しました*3。この公式リリースの model ディレクトリには４キャラクターの声が格納されています。

次に実行方法なんですが…

昨日の記事でも説明したように、起動後は初期化処理のため２０秒弱待たされます。また、１バイト文字（＝英字）で始まる入力は内部コマンド、３バイト文字（≒漢字やかな）で始まる入力は発話テキストと認識されることも変わりません。

さらにこのバージョンでは model を選択できるようにしました。コマンド起動時に第１引数として model ディレクトリのパスを受け付けます。デフォルト（引数なし）では "../../model" が指定されます。上記の build ディレクトリからだと "sharevox_core-0.1.2/model" を指すことになり、こちらには SHAREVOX core に付属する男性の声のみが収録されています。引数に "../model" を指定すると "sharevox_core-0.1.2/t2s/model" が指定され、前述のSHAREVOX公式リリースの４声が発声できるようになります。

音が聞こえないのであまり意味はないですがコマンドのログは以下のとおりです。

$ ./t2s
model: ../../model
coreの初期化に成功しました
dic: ../open_jtalk_dic_utf_8-1.11
終了
> こんにちは
音声生成中...終了: 69504 bytes
### play
hp->nSamplesPerSec: 48000
hp->nAvgBytesPerSec: 96000
ov_datasize: 69460
wavChannels: 1
wavBit: 16
wavSize: 69460
wavSamplingrate: 48000
playback_ms: 723 msec
> q
exit
$ ./t2s ../model
model: ../model
coreの初期化に成功しました
dic: ../open_jtalk_dic_utf_8-1.11
終了
> こんにちは
音声生成中...終了: 69848 bytes
### play
hp->nSamplesPerSec: 48000
hp->nAvgBytesPerSec: 96000
ov_datasize: 69804
wavChannels: 1
wavBit: 16
wavSize: 69804
wavSamplingrate: 48000
playback_ms: 727 msec
> q
exit
$

…ということで

ようやく t2s を喋らせることができました。明日は t2s の最後のお色直しとして、 model ディレクトリに収録されているキャラクターの一覧を表示する機能などを追加します。あと、t2s が動いている動画も貼り付けるよう頑張ってみるつもりです。

それではお約束の…

#つくよみちゃんを利用してフォロワー増やしたい

以上

*1:付属のメモランダムから拾い出した URL は次のページでした。

tips.hecomi.com

こちらはオリジナルの OpenJTalk+HTS を対象にしているようです。

*2:どういう事情かよく知りません。

*3:以前、紹介したと思いますが、次の Github の SHAREVOX 0.1.0 のリポジトリ…

github.com

…に格納されている sharevox_model-0.1.0.zip というZIPファイルです。