tts add voice choose (#2391)
Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM> Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
This commit is contained in:
@@ -15,29 +15,37 @@ class TTSModel(AIModel):
|
||||
"""
|
||||
model_type: ModelType = ModelType.TTS
|
||||
|
||||
def invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None):
|
||||
def invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
|
||||
user: Optional[str] = None):
|
||||
"""
|
||||
Invoke text-to-speech model
|
||||
|
||||
:param model: model name
|
||||
:param tenant_id: user tenant id
|
||||
:param credentials: model credentials
|
||||
:param voice: model timbre
|
||||
:param content_text: text content to be translated
|
||||
:param streaming: output is streaming
|
||||
:param user: unique user id
|
||||
:return: translated audio file
|
||||
"""
|
||||
try:
|
||||
return self._invoke(model=model, credentials=credentials, user=user, streaming=streaming, content_text=content_text)
|
||||
self._is_ffmpeg_installed()
|
||||
return self._invoke(model=model, credentials=credentials, user=user, streaming=streaming,
|
||||
content_text=content_text, voice=voice, tenant_id=tenant_id)
|
||||
except Exception as e:
|
||||
raise self._transform_invoke_error(e)
|
||||
|
||||
@abstractmethod
|
||||
def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None):
|
||||
def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
|
||||
user: Optional[str] = None):
|
||||
"""
|
||||
Invoke text-to-speech model
|
||||
|
||||
:param model: model name
|
||||
:param tenant_id: user tenant id
|
||||
:param credentials: model credentials
|
||||
:param voice: model timbre
|
||||
:param content_text: text content to be translated
|
||||
:param streaming: output is streaming
|
||||
:param user: unique user id
|
||||
@@ -45,7 +53,22 @@ class TTSModel(AIModel):
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def _get_model_voice(self, model: str, credentials: dict) -> any:
|
||||
def get_tts_model_voices(self, model: str, credentials: dict, language: str) -> list:
|
||||
"""
|
||||
Get the voices available for the given tts model
|
||||
|
||||
:param language: tts language
|
||||
:param model: model name
|
||||
:param credentials: model credentials
|
||||
:return: list of voices
|
||||
"""
|
||||
model_schema = self.get_model_schema(model, credentials)
|
||||
|
||||
if model_schema and ModelPropertyKey.VOICES in model_schema.model_properties:
|
||||
voices = model_schema.model_properties[ModelPropertyKey.VOICES]
|
||||
return [{'name': d['name'], 'value': d['mode']} for d in voices if language and language in d.get('language')]
|
||||
|
||||
def _get_model_default_voice(self, model: str, credentials: dict) -> any:
|
||||
"""
|
||||
Get voice for given tts model
|
||||
|
||||
|
||||
@@ -1,7 +1,31 @@
|
||||
model: tts-1-hd
|
||||
model: tts-1
|
||||
model_type: tts
|
||||
model_properties:
|
||||
default_voice: 'alloy'
|
||||
voices:
|
||||
- mode: 'alloy'
|
||||
name: 'Alloy'
|
||||
language: ['zh-CN', 'en-US']
|
||||
- mode: 'echo'
|
||||
name: 'Echo'
|
||||
language: ['zh-CN', 'en-US']
|
||||
- mode: 'fable'
|
||||
name: 'Fable'
|
||||
language: ['zh-CN', 'en-US']
|
||||
- mode: 'onyx'
|
||||
name: 'Onyx'
|
||||
language: ['zh-CN', 'en-US']
|
||||
- mode: 'nova'
|
||||
name: 'Nova'
|
||||
language: ['zh-CN', 'en-US']
|
||||
- mode: 'shimmer'
|
||||
name: 'Shimmer'
|
||||
language: ['zh-CN', 'en-US']
|
||||
word_limit: 120
|
||||
audio_type: 'mp3'
|
||||
max_workers: 5
|
||||
pricing:
|
||||
input: '0.03'
|
||||
output: '0'
|
||||
unit: '0.001'
|
||||
currency: USD
|
||||
|
||||
@@ -2,6 +2,30 @@ model: tts-1
|
||||
model_type: tts
|
||||
model_properties:
|
||||
default_voice: 'alloy'
|
||||
voices:
|
||||
- mode: 'alloy'
|
||||
name: 'Alloy'
|
||||
language: ['zh-CN', 'en-US']
|
||||
- mode: 'echo'
|
||||
name: 'Echo'
|
||||
language: ['zh-CN', 'en-US']
|
||||
- mode: 'fable'
|
||||
name: 'Fable'
|
||||
language: ['zh-CN', 'en-US']
|
||||
- mode: 'onyx'
|
||||
name: 'Onyx'
|
||||
language: ['zh-CN', 'en-US']
|
||||
- mode: 'nova'
|
||||
name: 'Nova'
|
||||
language: ['zh-CN', 'en-US']
|
||||
- mode: 'shimmer'
|
||||
name: 'Shimmer'
|
||||
language: ['zh-CN', 'en-US']
|
||||
word_limit: 120
|
||||
audio_type: 'mp3'
|
||||
max_workers: 5
|
||||
pricing:
|
||||
input: '0.015'
|
||||
output: '0'
|
||||
unit: '0.001'
|
||||
currency: USD
|
||||
|
||||
@@ -11,33 +11,40 @@ from core.model_runtime.errors.invoke import InvokeBadRequestError
|
||||
from core.model_runtime.errors.validate import CredentialsValidateFailedError
|
||||
from core.model_runtime.model_providers.__base.tts_model import TTSModel
|
||||
from core.model_runtime.model_providers.openai._common import _CommonOpenAI
|
||||
from extensions.ext_storage import storage
|
||||
|
||||
|
||||
class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
|
||||
"""
|
||||
Model class for OpenAI text to speech model.
|
||||
"""
|
||||
def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
|
||||
|
||||
def _invoke(self, model: str, tenant_id: str, credentials: dict,
|
||||
content_text: str, voice: str, streaming: bool, user: Optional[str] = None) -> any:
|
||||
"""
|
||||
_invoke text2speech model
|
||||
|
||||
:param model: model name
|
||||
:param tenant_id: user tenant id
|
||||
:param credentials: model credentials
|
||||
:param content_text: text content to be translated
|
||||
:param voice: model timbre
|
||||
:param streaming: output is streaming
|
||||
:param user: unique user id
|
||||
:return: text translated to audio file
|
||||
"""
|
||||
self._is_ffmpeg_installed()
|
||||
audio_type = self._get_model_audio_type(model, credentials)
|
||||
if not voice:
|
||||
voice = self._get_model_default_voice(model, credentials)
|
||||
if streaming:
|
||||
return Response(stream_with_context(self._tts_invoke_streaming(model=model,
|
||||
credentials=credentials,
|
||||
content_text=content_text,
|
||||
user=user)),
|
||||
tenant_id=tenant_id,
|
||||
voice=voice)),
|
||||
status=200, mimetype=f'audio/{audio_type}')
|
||||
else:
|
||||
return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user)
|
||||
return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
|
||||
|
||||
def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
|
||||
"""
|
||||
@@ -52,91 +59,96 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
|
||||
self._tts_invoke(
|
||||
model=model,
|
||||
credentials=credentials,
|
||||
content_text='Hello world!',
|
||||
user=user
|
||||
content_text='Hello Dify!',
|
||||
voice=self._get_model_default_voice(model, credentials),
|
||||
)
|
||||
except Exception as ex:
|
||||
raise CredentialsValidateFailedError(str(ex))
|
||||
|
||||
def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
|
||||
def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
|
||||
"""
|
||||
_tts_invoke text2speech model
|
||||
|
||||
:param model: model name
|
||||
:param credentials: model credentials
|
||||
:param content_text: text content to be translated
|
||||
:param user: unique user id
|
||||
:param voice: model timbre
|
||||
:return: text translated to audio file
|
||||
"""
|
||||
audio_type = self._get_model_audio_type(model, credentials)
|
||||
word_limit = self._get_model_word_limit(model, credentials)
|
||||
max_workers = self._get_model_workers_limit(model, credentials)
|
||||
|
||||
try:
|
||||
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
|
||||
audio_bytes_list = list()
|
||||
|
||||
# Create a thread pool and map the function to the list of sentences
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
futures = [executor.submit(self._process_sentence, sentence, model, credentials) for sentence
|
||||
in sentences]
|
||||
futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice,
|
||||
credentials=credentials) for sentence in sentences]
|
||||
for future in futures:
|
||||
try:
|
||||
audio_bytes_list.append(future.result())
|
||||
if future.result():
|
||||
audio_bytes_list.append(future.result())
|
||||
except Exception as ex:
|
||||
raise InvokeBadRequestError(str(ex))
|
||||
|
||||
audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
|
||||
audio_bytes_list if audio_bytes]
|
||||
combined_segment = reduce(lambda x, y: x + y, audio_segments)
|
||||
buffer: BytesIO = BytesIO()
|
||||
combined_segment.export(buffer, format=audio_type)
|
||||
buffer.seek(0)
|
||||
return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
|
||||
if len(audio_bytes_list) > 0:
|
||||
audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
|
||||
audio_bytes_list if audio_bytes]
|
||||
combined_segment = reduce(lambda x, y: x + y, audio_segments)
|
||||
buffer: BytesIO = BytesIO()
|
||||
combined_segment.export(buffer, format=audio_type)
|
||||
buffer.seek(0)
|
||||
return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
|
||||
except Exception as ex:
|
||||
raise InvokeBadRequestError(str(ex))
|
||||
|
||||
# TODO: improve the streaming function
|
||||
def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
|
||||
def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
|
||||
voice: str) -> any:
|
||||
"""
|
||||
_tts_invoke_streaming text2speech model
|
||||
|
||||
:param model: model name
|
||||
:param tenant_id: user tenant id
|
||||
:param credentials: model credentials
|
||||
:param content_text: text content to be translated
|
||||
:param user: unique user id
|
||||
:param voice: model timbre
|
||||
:return: text translated to audio file
|
||||
"""
|
||||
# transform credentials to kwargs for model instance
|
||||
credentials_kwargs = self._to_credential_kwargs(credentials)
|
||||
voice_name = self._get_model_voice(model, credentials)
|
||||
if not voice:
|
||||
voice = self._get_model_default_voice(model, credentials)
|
||||
word_limit = self._get_model_word_limit(model, credentials)
|
||||
audio_type = self._get_model_audio_type(model, credentials)
|
||||
tts_file_id = self._get_file_name(content_text)
|
||||
file_path = f'storage/generate_files/{audio_type}/{tts_file_id}.{audio_type}'
|
||||
file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
|
||||
try:
|
||||
client = OpenAI(**credentials_kwargs)
|
||||
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
|
||||
for sentence in sentences:
|
||||
response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
|
||||
response.stream_to_file(file_path)
|
||||
response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
|
||||
# response.stream_to_file(file_path)
|
||||
storage.save(file_path, response.read())
|
||||
except Exception as ex:
|
||||
raise InvokeBadRequestError(str(ex))
|
||||
|
||||
def _process_sentence(self, sentence: str, model: str, credentials: dict):
|
||||
def _process_sentence(self, sentence: str, model: str,
|
||||
voice, credentials: dict):
|
||||
"""
|
||||
_tts_invoke openai text2speech model api
|
||||
|
||||
:param model: model name
|
||||
:param credentials: model credentials
|
||||
:param voice: model timbre
|
||||
:param sentence: text content to be translated
|
||||
:return: text translated to audio file
|
||||
"""
|
||||
# transform credentials to kwargs for model instance
|
||||
credentials_kwargs = self._to_credential_kwargs(credentials)
|
||||
voice_name = self._get_model_voice(model, credentials)
|
||||
|
||||
client = OpenAI(**credentials_kwargs)
|
||||
response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
|
||||
response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
|
||||
if isinstance(response.read(), bytes):
|
||||
return response.read()
|
||||
|
||||
@@ -1,7 +1,134 @@
|
||||
model: tts-1
|
||||
model_type: tts
|
||||
model_properties:
|
||||
default_voice: 'sambert-zhiru-v1' # 音色参考 https://help.aliyun.com/zh/dashscope/model-list 配置
|
||||
default_voice: 'sambert-zhiru-v1'
|
||||
voices:
|
||||
- mode: "sambert-zhinan-v1"
|
||||
name: "知楠(广告男声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhiqi-v1"
|
||||
name: "知琪(温柔女声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhichu-v1"
|
||||
name: "知厨(新闻播报)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhide-v1"
|
||||
name: "知德(新闻男声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhijia-v1"
|
||||
name: "知佳(标准女声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhiru-v1"
|
||||
name: "知茹(新闻女声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhiqian-v1"
|
||||
name: "知倩(配音解说、新闻播报)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhixiang-v1"
|
||||
name: "知祥(配音解说)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhiwei-v1"
|
||||
name: "知薇(萝莉女声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhihao-v1"
|
||||
name: "知浩(咨询男声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhijing-v1"
|
||||
name: "知婧(严厉女声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhiming-v1"
|
||||
name: "知茗(诙谐男声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhimo-v1"
|
||||
name: "知墨(情感男声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhina-v1"
|
||||
name: "知娜(浙普女声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhishu-v1"
|
||||
name: "知树(资讯男声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhistella-v1"
|
||||
name: "知莎(知性女声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhiting-v1"
|
||||
name: "知婷(电台女声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhixiao-v1"
|
||||
name: "知笑(资讯女声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhiya-v1"
|
||||
name: "知雅(严厉女声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhiye-v1"
|
||||
name: "知晔(青年男声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhiying-v1"
|
||||
name: "知颖(软萌童声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhiyuan-v1"
|
||||
name: "知媛(知心姐姐)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhigui-v1"
|
||||
name: "知柜(直播女声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhishuo-v1"
|
||||
name: "知硕(自然男声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhimiao-emo-v1"
|
||||
name: "知妙(多种情感女声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhimao-v1"
|
||||
name: "知猫(直播女声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhilun-v1"
|
||||
name: "知伦(悬疑解说)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhifei-v1"
|
||||
name: "知飞(激昂解说)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-zhida-v1"
|
||||
name: "知达(标准男声)"
|
||||
language: [ "zh-CN", "en-US" ]
|
||||
- mode: "sambert-camila-v1"
|
||||
name: "Camila(西班牙语女声)"
|
||||
language: [ "es-ES" ]
|
||||
- mode: "sambert-perla-v1"
|
||||
name: "Perla(意大利语女声)"
|
||||
language: [ "it-IT" ]
|
||||
- mode: "sambert-indah-v1"
|
||||
name: "Indah(印尼语女声)"
|
||||
language: [ "id-ID" ]
|
||||
- mode: "sambert-clara-v1"
|
||||
name: "Clara(法语女声)"
|
||||
language: [ "fr-FR" ]
|
||||
- mode: "sambert-hanna-v1"
|
||||
name: "Hanna(德语女声)"
|
||||
language: [ "de-DE" ]
|
||||
- mode: "sambert-beth-v1"
|
||||
name: "Beth(咨询女声)"
|
||||
language: [ "en-US" ]
|
||||
- mode: "sambert-betty-v1"
|
||||
name: "Betty(客服女声)"
|
||||
language: [ "en-US" ]
|
||||
- mode: "sambert-cally-v1"
|
||||
name: "Cally(自然女声)"
|
||||
language: [ "en-US" ]
|
||||
- mode: "sambert-cindy-v1"
|
||||
name: "Cindy(对话女声)"
|
||||
language: [ "en-US" ]
|
||||
- mode: "sambert-eva-v1"
|
||||
name: "Eva(陪伴女声)"
|
||||
language: [ "en-US" ]
|
||||
- mode: "sambert-donna-v1"
|
||||
name: "Donna(教育女声)"
|
||||
language: [ "en-US" ]
|
||||
- mode: "sambert-brian-v1"
|
||||
name: "Brian(客服男声)"
|
||||
language: [ "en-US" ]
|
||||
- mode: "sambert-waan-v1"
|
||||
name: "Waan(泰语女声)"
|
||||
language: [ "th-TH" ]
|
||||
word_limit: 120
|
||||
audio_type: 'mp3'
|
||||
max_workers: 5
|
||||
|
||||
@@ -11,33 +11,40 @@ from core.model_runtime.errors.invoke import InvokeBadRequestError
|
||||
from core.model_runtime.errors.validate import CredentialsValidateFailedError
|
||||
from core.model_runtime.model_providers.__base.tts_model import TTSModel
|
||||
from core.model_runtime.model_providers.tongyi._common import _CommonTongyi
|
||||
from extensions.ext_storage import storage
|
||||
|
||||
|
||||
class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
|
||||
"""
|
||||
Model class for Tongyi text to speech model.
|
||||
"""
|
||||
def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
|
||||
|
||||
def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
|
||||
user: Optional[str] = None) -> any:
|
||||
"""
|
||||
_invoke text2speech model
|
||||
|
||||
:param model: model name
|
||||
:param tenant_id: user tenant id
|
||||
:param credentials: model credentials
|
||||
:param voice: model timbre
|
||||
:param content_text: text content to be translated
|
||||
:param streaming: output is streaming
|
||||
:param user: unique user id
|
||||
:return: text translated to audio file
|
||||
"""
|
||||
self._is_ffmpeg_installed()
|
||||
audio_type = self._get_model_audio_type(model, credentials)
|
||||
if not voice:
|
||||
voice = self._get_model_default_voice(model, credentials)
|
||||
if streaming:
|
||||
return Response(stream_with_context(self._tts_invoke_streaming(model=model,
|
||||
credentials=credentials,
|
||||
content_text=content_text,
|
||||
user=user)),
|
||||
voice=voice,
|
||||
tenant_id=tenant_id)),
|
||||
status=200, mimetype=f'audio/{audio_type}')
|
||||
else:
|
||||
return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user)
|
||||
return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
|
||||
|
||||
def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
|
||||
"""
|
||||
@@ -52,91 +59,96 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
|
||||
self._tts_invoke(
|
||||
model=model,
|
||||
credentials=credentials,
|
||||
content_text='Hello world!',
|
||||
user=user
|
||||
content_text='Hello Dify!',
|
||||
voice=self._get_model_default_voice(model, credentials),
|
||||
)
|
||||
except Exception as ex:
|
||||
raise CredentialsValidateFailedError(str(ex))
|
||||
|
||||
def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
|
||||
def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
|
||||
"""
|
||||
_tts_invoke text2speech model
|
||||
|
||||
:param model: model name
|
||||
:param credentials: model credentials
|
||||
:param voice: model timbre
|
||||
:param content_text: text content to be translated
|
||||
:param user: unique user id
|
||||
:return: text translated to audio file
|
||||
"""
|
||||
audio_type = self._get_model_audio_type(model, credentials)
|
||||
word_limit = self._get_model_word_limit(model, credentials)
|
||||
max_workers = self._get_model_workers_limit(model, credentials)
|
||||
|
||||
try:
|
||||
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
|
||||
audio_bytes_list = list()
|
||||
|
||||
# Create a thread pool and map the function to the list of sentences
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
futures = [executor.submit(self._process_sentence, model=model, sentence=sentence,
|
||||
credentials=credentials, audio_type=audio_type) for sentence in sentences]
|
||||
futures = [executor.submit(self._process_sentence, sentence=sentence,
|
||||
credentials=credentials, voice=voice, audio_type=audio_type) for sentence in
|
||||
sentences]
|
||||
for future in futures:
|
||||
try:
|
||||
audio_bytes_list.append(future.result())
|
||||
if future.result():
|
||||
audio_bytes_list.append(future.result())
|
||||
except Exception as ex:
|
||||
raise InvokeBadRequestError(str(ex))
|
||||
|
||||
audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
|
||||
audio_bytes_list if audio_bytes]
|
||||
combined_segment = reduce(lambda x, y: x + y, audio_segments)
|
||||
buffer: BytesIO = BytesIO()
|
||||
combined_segment.export(buffer, format=audio_type)
|
||||
buffer.seek(0)
|
||||
return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
|
||||
if len(audio_bytes_list) > 0:
|
||||
audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
|
||||
audio_bytes_list if audio_bytes]
|
||||
combined_segment = reduce(lambda x, y: x + y, audio_segments)
|
||||
buffer: BytesIO = BytesIO()
|
||||
combined_segment.export(buffer, format=audio_type)
|
||||
buffer.seek(0)
|
||||
return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
|
||||
except Exception as ex:
|
||||
raise InvokeBadRequestError(str(ex))
|
||||
|
||||
# TODO: improve the streaming function
|
||||
def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
|
||||
def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
|
||||
voice: str) -> any:
|
||||
"""
|
||||
_tts_invoke_streaming text2speech model
|
||||
|
||||
:param model: model name
|
||||
:param tenant_id: user tenant id
|
||||
:param credentials: model credentials
|
||||
:param voice: model timbre
|
||||
:param content_text: text content to be translated
|
||||
:param user: unique user id
|
||||
:return: text translated to audio file
|
||||
"""
|
||||
# transform credentials to kwargs for model instance
|
||||
dashscope.api_key = credentials.get('dashscope_api_key')
|
||||
voice_name = self._get_model_voice(model, credentials)
|
||||
word_limit = self._get_model_word_limit(model, credentials)
|
||||
audio_type = self._get_model_audio_type(model, credentials)
|
||||
tts_file_id = self._get_file_name(content_text)
|
||||
file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
|
||||
try:
|
||||
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
|
||||
for sentence in sentences:
|
||||
response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(),
|
||||
response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000,
|
||||
text=sentence.strip(),
|
||||
format=audio_type, word_timestamp_enabled=True,
|
||||
phoneme_timestamp_enabled=True)
|
||||
if isinstance(response.get_audio_data(), bytes):
|
||||
return response.get_audio_data()
|
||||
storage.save(file_path, response.get_audio_data())
|
||||
except Exception as ex:
|
||||
raise InvokeBadRequestError(str(ex))
|
||||
|
||||
def _process_sentence(self, sentence: str, model: str, credentials: dict, audio_type: str):
|
||||
@staticmethod
|
||||
def _process_sentence(sentence: str, credentials: dict, voice: str, audio_type: str):
|
||||
"""
|
||||
_tts_invoke Tongyi text2speech model api
|
||||
|
||||
:param model: model name
|
||||
:param credentials: model credentials
|
||||
:param sentence: text content to be translated
|
||||
:param voice: model timbre
|
||||
:param audio_type: audio file type
|
||||
:return: text translated to audio file
|
||||
"""
|
||||
# transform credentials to kwargs for model instance
|
||||
dashscope.api_key = credentials.get('dashscope_api_key')
|
||||
voice_name = self._get_model_voice(model, credentials)
|
||||
|
||||
response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(), format=audio_type)
|
||||
response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000,
|
||||
text=sentence.strip(),
|
||||
format=audio_type)
|
||||
if isinstance(response.get_audio_data(), bytes):
|
||||
return response.get_audio_data()
|
||||
|
||||
Reference in New Issue
Block a user