1 /* ----------------------------------------------------------------- */
2 /* The HMM-Based Speech Synthesis Engine "hts_engine API" */
3 /* developed by HTS Working Group */
4 /* http://hts-engine.sourceforge.net/ */
5 /* ----------------------------------------------------------------- */
7 /* Copyright (c) 2001-2013 Nagoya Institute of Technology */
8 /* Department of Computer Science */
10 /* 2001-2008 Tokyo Institute of Technology */
11 /* Interdisciplinary Graduate School of */
12 /* Science and Engineering */
14 /* All rights reserved. */
16 /* Redistribution and use in source and binary forms, with or */
17 /* without modification, are permitted provided that the following */
18 /* conditions are met: */
20 /* - Redistributions of source code must retain the above copyright */
21 /* notice, this list of conditions and the following disclaimer. */
22 /* - Redistributions in binary form must reproduce the above */
23 /* copyright notice, this list of conditions and the following */
24 /* disclaimer in the documentation and/or other materials provided */
25 /* with the distribution. */
26 /* - Neither the name of the HTS working group nor the names of its */
27 /* contributors may be used to endorse or promote products derived */
28 /* from this software without specific prior written permission. */
30 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */
31 /* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */
32 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
33 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
34 /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS */
35 /* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, */
36 /* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED */
37 /* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, */
38 /* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON */
39 /* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, */
40 /* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY */
41 /* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
42 /* POSSIBILITY OF SUCH DAMAGE. */
43 /* ----------------------------------------------------------------- */
49 #define HTS_SSTREAM_C_START extern "C" {
50 #define HTS_SSTREAM_C_END }
52 #define HTS_SSTREAM_C_START
53 #define HTS_SSTREAM_C_END
54 #endif /* __CPLUSPLUS */
61 /* hts_engine libraries */
62 #include "HTS_hidden.h"
64 /* HTS_set_default_duration: set default duration from state duration probability distribution */
65 static double HTS_set_default_duration(size_t * duration, double *mean, double *vari, size_t size)
71 for (i = 0; i < size; i++) {
76 duration[i] = (size_t) temp;
83 /* HTS_set_specified_duration: set duration from state duration probability distribution and specified frame length */
84 static double HTS_set_specified_duration(size_t * duration, double *mean, double *vari, size_t size, double frame_length)
93 /* get the target frame length */
94 if (frame_length + 0.5 < 1.0)
97 target_length = (size_t) (frame_length + 0.5);
99 /* check the specified duration */
100 if (target_length <= size) {
101 if (target_length < size)
102 HTS_error(-1, "HTS_set_specified_duration: Specified frame length is too short.\n");
103 for (i = 0; i < size; i++)
105 return (double) size;
108 /* RHO calculation */
111 for (i = 0; i < size; i++) {
115 rho = ((double) target_length - temp1) / temp2;
117 /* first estimation */
118 for (i = 0; i < size; i++) {
119 temp1 = mean[i] + rho * vari[i] + 0.5;
123 duration[i] = (size_t) temp1;
127 /* loop estimation */
128 while (target_length != sum) {
129 /* search flexible state and modify its duration */
130 if (target_length > sum) {
132 for (i = 0; i < size; i++) {
133 temp2 = fabs(rho - ((double) duration[i] + 1 - mean[i]) / vari[i]);
134 if (j < 0 || temp1 > temp2) {
143 for (i = 0; i < size; i++) {
144 if (duration[i] > 1) {
145 temp2 = fabs(rho - ((double) duration[i] - 1 - mean[i]) / vari[i]);
146 if (j < 0 || temp1 > temp2) {
157 return (double) target_length;
160 /* HTS_SStreamSet_initialize: initialize state stream set */
161 void HTS_SStreamSet_initialize(HTS_SStreamSet * sss)
166 sss->duration = NULL;
167 sss->total_state = 0;
168 sss->total_frame = 0;
171 /* HTS_SStreamSet_create: parse label and determine state duration */
172 HTS_Boolean HTS_SStreamSet_create(HTS_SStreamSet * sss, HTS_ModelSet * ms, HTS_Label * label, HTS_Boolean phoneme_alignment_flag, double speed, double *duration_iw, double **parameter_iw, double **gv_iw)
179 double *duration_mean, *duration_vari;
184 /* check interpolation weights */
185 for (i = 0, temp = 0.0; i < HTS_ModelSet_get_nvoices(ms); i++)
186 temp += duration_iw[i];
189 } else if (temp != 1.0) {
190 for (i = 0; i < HTS_ModelSet_get_nvoices(ms); i++)
191 if (duration_iw[i] != 0.0)
192 duration_iw[i] /= temp;
195 for (i = 0; i < HTS_ModelSet_get_nstream(ms); i++) {
196 for (j = 0, temp = 0.0; j < HTS_ModelSet_get_nvoices(ms); j++)
197 temp += parameter_iw[i][j];
200 } else if (temp != 1.0) {
201 for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++)
202 if (parameter_iw[i][j] != 0.0)
203 parameter_iw[i][j] /= temp;
205 if (HTS_ModelSet_use_gv(ms, i)) {
206 for (j = 0, temp = 0.0; j < HTS_ModelSet_get_nvoices(ms); j++)
210 else if (temp != 1.0)
211 for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++)
212 if (gv_iw[i][j] != 0.0)
217 /* initialize state sequence */
218 sss->nstate = HTS_ModelSet_get_nstate(ms);
219 sss->nstream = HTS_ModelSet_get_nstream(ms);
220 sss->total_frame = 0;
221 sss->total_state = HTS_Label_get_size(label) * sss->nstate;
222 sss->duration = (size_t *) HTS_calloc(sss->total_state, sizeof(size_t));
223 sss->sstream = (HTS_SStream *) HTS_calloc(sss->nstream, sizeof(HTS_SStream));
224 for (i = 0; i < sss->nstream; i++) {
225 sst = &sss->sstream[i];
226 sst->vector_length = HTS_ModelSet_get_vector_length(ms, i);
227 sst->mean = (double **) HTS_calloc(sss->total_state, sizeof(double *));
228 sst->vari = (double **) HTS_calloc(sss->total_state, sizeof(double *));
229 if (HTS_ModelSet_is_msd(ms, i))
230 sst->msd = (double *) HTS_calloc(sss->total_state, sizeof(double));
233 for (j = 0; j < sss->total_state; j++) {
234 sst->mean[j] = (double *) HTS_calloc(sst->vector_length * HTS_ModelSet_get_window_size(ms, i), sizeof(double));
235 sst->vari[j] = (double *) HTS_calloc(sst->vector_length * HTS_ModelSet_get_window_size(ms, i), sizeof(double));
237 if (HTS_ModelSet_use_gv(ms, i)) {
238 sst->gv_switch = (HTS_Boolean *) HTS_calloc(sss->total_state, sizeof(HTS_Boolean));
239 for (j = 0; j < sss->total_state; j++)
240 sst->gv_switch[j] = TRUE;
242 sst->gv_switch = NULL;
246 /* determine state duration */
247 duration_mean = (double *) HTS_calloc(sss->total_state, sizeof(double));
248 duration_vari = (double *) HTS_calloc(sss->total_state, sizeof(double));
249 for (i = 0; i < HTS_Label_get_size(label); i++)
250 HTS_ModelSet_get_duration(ms, HTS_Label_get_string(label, i), duration_iw, &duration_mean[i * sss->nstate], &duration_vari[i * sss->nstate]);
251 if (phoneme_alignment_flag == TRUE) {
252 /* use duration set by user */
256 for (i = 0; i < HTS_Label_get_size(label); i++) {
257 temp = HTS_Label_get_end_frame(label, i);
259 next_time += (size_t) HTS_set_specified_duration(&sss->duration[next_state], &duration_mean[next_state], &duration_vari[next_state], state + sss->nstate - next_state, temp - next_time);
260 next_state = state + sss->nstate;
261 } else if (i + 1 == HTS_Label_get_size(label)) {
262 HTS_error(-1, "HTS_SStreamSet_create: The time of final label is not specified.\n");
263 HTS_set_default_duration(&sss->duration[next_state], &duration_mean[next_state], &duration_vari[next_state], state + sss->nstate - next_state);
265 state += sss->nstate;
268 /* determine frame length */
271 for (i = 0; i < sss->total_state; i++) {
272 temp += duration_mean[i];
274 frame_length = temp / speed;
275 HTS_set_specified_duration(sss->duration, duration_mean, duration_vari, sss->total_state, frame_length);
277 HTS_set_default_duration(sss->duration, duration_mean, duration_vari, sss->total_state);
280 HTS_free(duration_mean);
281 HTS_free(duration_vari);
284 for (i = 0, state = 0; i < HTS_Label_get_size(label); i++) {
285 for (j = 2; j <= sss->nstate + 1; j++) {
286 sss->total_frame += sss->duration[state];
287 for (k = 0; k < sss->nstream; k++) {
288 sst = &sss->sstream[k];
290 HTS_ModelSet_get_parameter(ms, k, j, HTS_Label_get_string(label, i), parameter_iw[k], sst->mean[state], sst->vari[state], &sst->msd[state]);
292 HTS_ModelSet_get_parameter(ms, k, j, HTS_Label_get_string(label, i), parameter_iw[k], sst->mean[state], sst->vari[state], NULL);
298 /* copy dynamic window */
299 for (i = 0; i < sss->nstream; i++) {
300 sst = &sss->sstream[i];
301 sst->win_size = HTS_ModelSet_get_window_size(ms, i);
302 sst->win_max_width = HTS_ModelSet_get_window_max_width(ms, i);
303 sst->win_l_width = (int *) HTS_calloc(sst->win_size, sizeof(int));
304 sst->win_r_width = (int *) HTS_calloc(sst->win_size, sizeof(int));
305 sst->win_coefficient = (double **) HTS_calloc(sst->win_size, sizeof(double));
306 for (j = 0; j < sst->win_size; j++) {
307 sst->win_l_width[j] = HTS_ModelSet_get_window_left_width(ms, i, j);
308 sst->win_r_width[j] = HTS_ModelSet_get_window_right_width(ms, i, j);
309 if (sst->win_l_width[j] + sst->win_r_width[j] == 0)
310 sst->win_coefficient[j] = (double *) HTS_calloc(-2 * sst->win_l_width[j] + 1, sizeof(double));
312 sst->win_coefficient[j] = (double *) HTS_calloc(-2 * sst->win_l_width[j], sizeof(double));
313 sst->win_coefficient[j] -= sst->win_l_width[j];
314 for (shift = sst->win_l_width[j]; shift <= sst->win_r_width[j]; shift++)
315 sst->win_coefficient[j][shift] = HTS_ModelSet_get_window_coefficient(ms, i, j, shift);
320 for (i = 0; i < sss->nstream; i++) {
321 sst = &sss->sstream[i];
322 if (HTS_ModelSet_use_gv(ms, i)) {
323 sst->gv_mean = (double *) HTS_calloc(sst->vector_length, sizeof(double));
324 sst->gv_vari = (double *) HTS_calloc(sst->vector_length, sizeof(double));
325 HTS_ModelSet_get_gv(ms, i, HTS_Label_get_string(label, 0), gv_iw[i], sst->gv_mean, sst->gv_vari);
332 for (i = 0; i < HTS_Label_get_size(label); i++)
333 if (HTS_ModelSet_get_gv_flag(ms, HTS_Label_get_string(label, i)) == FALSE)
334 for (j = 0; j < sss->nstream; j++)
335 if (HTS_ModelSet_use_gv(ms, j) == TRUE)
336 for (k = 0; k < sss->nstate; k++)
337 sss->sstream[j].gv_switch[i * sss->nstate + k] = FALSE;
342 /* HTS_SStreamSet_get_nstream: get number of streams */
343 size_t HTS_SStreamSet_get_nstream(HTS_SStreamSet * sss)
348 /* HTS_SStreamSet_get_vector_length: get vector length */
349 size_t HTS_SStreamSet_get_vector_length(HTS_SStreamSet * sss, size_t stream_index)
351 return sss->sstream[stream_index].vector_length;
354 /* HTS_SStreamSet_is_msd: get MSD flag */
355 HTS_Boolean HTS_SStreamSet_is_msd(HTS_SStreamSet * sss, size_t stream_index)
357 return sss->sstream[stream_index].msd ? TRUE : FALSE;
360 /* HTS_SStreamSet_get_total_state: get total number of states */
361 size_t HTS_SStreamSet_get_total_state(HTS_SStreamSet * sss)
363 return sss->total_state;
366 /* HTS_SStreamSet_get_total_frame: get total number of frames */
367 size_t HTS_SStreamSet_get_total_frame(HTS_SStreamSet * sss)
369 return sss->total_frame;
372 /* HTS_SStreamSet_get_msd: get MSD parameter */
373 double HTS_SStreamSet_get_msd(HTS_SStreamSet * sss, size_t stream_index, size_t state_index)
375 return sss->sstream[stream_index].msd[state_index];
378 /* HTS_SStreamSet_get_window_size: get dynamic window size */
379 size_t HTS_SStreamSet_get_window_size(HTS_SStreamSet * sss, size_t stream_index)
381 return sss->sstream[stream_index].win_size;
384 /* HTS_SStreamSet_get_window_left_width: get left width of dynamic window */
385 int HTS_SStreamSet_get_window_left_width(HTS_SStreamSet * sss, size_t stream_index, size_t window_index)
387 return sss->sstream[stream_index].win_l_width[window_index];
390 /* HTS_SStreamSet_get_window_right_width: get right width of dynamic window */
391 int HTS_SStreamSet_get_window_right_width(HTS_SStreamSet * sss, size_t stream_index, size_t window_index)
393 return sss->sstream[stream_index].win_r_width[window_index];
396 /* HTS_SStreamSet_get_window_coefficient: get coefficient of dynamic window */
397 double HTS_SStreamSet_get_window_coefficient(HTS_SStreamSet * sss, size_t stream_index, size_t window_index, int coefficient_index)
399 return sss->sstream[stream_index].win_coefficient[window_index][coefficient_index];
402 /* HTS_SStreamSet_get_window_max_width: get max width of dynamic window */
403 size_t HTS_SStreamSet_get_window_max_width(HTS_SStreamSet * sss, size_t stream_index)
405 return sss->sstream[stream_index].win_max_width;
408 /* HTS_SStreamSet_use_gv: get GV flag */
409 HTS_Boolean HTS_SStreamSet_use_gv(HTS_SStreamSet * sss, size_t stream_index)
411 return sss->sstream[stream_index].gv_mean ? TRUE : FALSE;
414 /* HTS_SStreamSet_get_duration: get state duration */
415 size_t HTS_SStreamSet_get_duration(HTS_SStreamSet * sss, size_t state_index)
417 return sss->duration[state_index];
420 /* HTS_SStreamSet_get_mean: get mean parameter */
421 double HTS_SStreamSet_get_mean(HTS_SStreamSet * sss, size_t stream_index, size_t state_index, size_t vector_index)
423 return sss->sstream[stream_index].mean[state_index][vector_index];
426 /* HTS_SStreamSet_set_mean: set mean parameter */
427 void HTS_SStreamSet_set_mean(HTS_SStreamSet * sss, size_t stream_index, size_t state_index, size_t vector_index, double f)
429 sss->sstream[stream_index].mean[state_index][vector_index] = f;
432 /* HTS_SStreamSet_get_vari: get variance parameter */
433 double HTS_SStreamSet_get_vari(HTS_SStreamSet * sss, size_t stream_index, size_t state_index, size_t vector_index)
435 return sss->sstream[stream_index].vari[state_index][vector_index];
438 /* HTS_SStreamSet_set_vari: set variance parameter */
439 void HTS_SStreamSet_set_vari(HTS_SStreamSet * sss, size_t stream_index, size_t state_index, size_t vector_index, double f)
441 sss->sstream[stream_index].vari[state_index][vector_index] = f;
444 /* HTS_SStreamSet_get_gv_mean: get GV mean parameter */
445 double HTS_SStreamSet_get_gv_mean(HTS_SStreamSet * sss, size_t stream_index, size_t vector_index)
447 return sss->sstream[stream_index].gv_mean[vector_index];
450 /* HTS_SStreamSet_get_gv_vari: get GV variance parameter */
451 double HTS_SStreamSet_get_gv_vari(HTS_SStreamSet * sss, size_t stream_index, size_t vector_index)
453 return sss->sstream[stream_index].gv_vari[vector_index];
456 /* HTS_SStreamSet_set_gv_switch: set GV switch */
457 void HTS_SStreamSet_set_gv_switch(HTS_SStreamSet * sss, size_t stream_index, size_t state_index, HTS_Boolean i)
459 sss->sstream[stream_index].gv_switch[state_index] = i;
462 /* HTS_SStreamSet_get_gv_switch: get GV switch */
463 HTS_Boolean HTS_SStreamSet_get_gv_switch(HTS_SStreamSet * sss, size_t stream_index, size_t state_index)
465 return sss->sstream[stream_index].gv_switch[state_index];
468 /* HTS_SStreamSet_clear: free state stream set */
469 void HTS_SStreamSet_clear(HTS_SStreamSet * sss)
475 for (i = 0; i < sss->nstream; i++) {
476 sst = &sss->sstream[i];
477 for (j = 0; j < sss->total_state; j++) {
478 HTS_free(sst->mean[j]);
479 HTS_free(sst->vari[j]);
485 for (j = 0; j < sst->win_size; j++) {
486 sst->win_coefficient[j] += sst->win_l_width[j];
487 HTS_free(sst->win_coefficient[j]);
489 HTS_free(sst->win_coefficient);
490 HTS_free(sst->win_l_width);
491 HTS_free(sst->win_r_width);
493 HTS_free(sst->gv_mean);
495 HTS_free(sst->gv_vari);
497 HTS_free(sst->gv_switch);
499 HTS_free(sss->sstream);
502 HTS_free(sss->duration);
504 HTS_SStreamSet_initialize(sss);
509 #endif /* !HTS_SSTREAM_C */