tts models support (#2033)
Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM> Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com> Co-authored-by: Yeuoly <45712896+Yeuoly@users.noreply.github.com>
This commit is contained in:
@@ -30,7 +30,7 @@ ENV TZ UTC
|
||||
WORKDIR /app/api
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends bash curl wget vim nodejs \
|
||||
&& apt-get install -y --no-install-recommends bash curl wget vim nodejs ffmpeg \
|
||||
&& apt-get autoremove \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
@@ -124,6 +124,7 @@ def load_user_from_request(request_from_flask_login):
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
@login_manager.unauthorized_handler
|
||||
def unauthorized_handler():
|
||||
"""Handle unauthorized requests."""
|
||||
|
||||
@@ -32,9 +32,10 @@ class ChatMessageAudioApi(Resource):
|
||||
file = request.files['file']
|
||||
|
||||
try:
|
||||
response = AudioService.transcript(
|
||||
response = AudioService.transcript_asr(
|
||||
tenant_id=app_model.tenant_id,
|
||||
file=file,
|
||||
promot=app_model.app_model_config.pre_prompt
|
||||
)
|
||||
|
||||
return response
|
||||
@@ -62,6 +63,48 @@ class ChatMessageAudioApi(Resource):
|
||||
except Exception as e:
|
||||
logging.exception("internal server error.")
|
||||
raise InternalServerError()
|
||||
|
||||
|
||||
api.add_resource(ChatMessageAudioApi, '/apps/<uuid:app_id>/audio-to-text')
|
||||
|
||||
class ChatMessageTextApi(Resource):
    """Console resource that turns posted text into audio for an app."""

    @setup_required
    @login_required
    @account_initialization_required
    def post(self, app_id):
        """Synthesize speech for the ``text`` form field of the request.

        :param app_id: app identifier taken from the URL; converted to ``str`` before lookup
        :return: dict whose ``data`` key holds the response body decoded as latin1
        """
        app_id = str(app_id)
        app_model = _get_app(app_id, None)
        try:
            response = AudioService.transcript_tts(
                tenant_id=app_model.tenant_id,
                text=request.form['text'],
                streaming=False
            )

            # latin1 maps every byte to exactly one code point, so the body bytes survive as a str
            return {'data': response.data.decode('latin1')}
        # Service/provider exceptions are re-raised as controller-level error types.
        except services.errors.app_model_config.AppModelConfigBrokenError:
            logging.exception("App model config broken.")
            raise AppUnavailableError()
        except NoAudioUploadedServiceError:
            raise NoAudioUploadedError()
        except AudioTooLargeServiceError as e:
            raise AudioTooLargeError(str(e))
        except UnsupportedAudioTypeServiceError:
            raise UnsupportedAudioTypeError()
        except ProviderNotSupportSpeechToTextServiceError:
            raise ProviderNotSupportSpeechToTextError()
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()
        except InvokeError as e:
            raise CompletionRequestError(e.description)
        except ValueError as e:
            # ValueError is passed through unchanged instead of being logged as an internal error
            raise e
        except Exception as e:
            logging.exception("internal server error.")
            raise InternalServerError()
|
||||
|
||||
|
||||
api.add_resource(ChatMessageAudioApi, '/apps/<uuid:app_id>/audio-to-text')
|
||||
api.add_resource(ChatMessageTextApi, '/apps/<uuid:app_id>/text-to-audio')
|
||||
|
||||
@@ -29,7 +29,7 @@ class ChatAudioApi(InstalledAppResource):
|
||||
file = request.files['file']
|
||||
|
||||
try:
|
||||
response = AudioService.transcript(
|
||||
response = AudioService.transcript_asr(
|
||||
tenant_id=app_model.tenant_id,
|
||||
file=file,
|
||||
)
|
||||
@@ -59,6 +59,48 @@ class ChatAudioApi(InstalledAppResource):
|
||||
except Exception as e:
|
||||
logging.exception("internal server error.")
|
||||
raise InternalServerError()
|
||||
|
||||
|
||||
api.add_resource(ChatAudioApi, '/installed-apps/<uuid:installed_app_id>/audio-to-text', endpoint='installed_app_audio')
|
||||
|
||||
class ChatTextApi(InstalledAppResource):
    """Installed-app resource that turns posted text into audio."""

    def post(self, installed_app):
        """Synthesize speech for the ``text`` form field of the request.

        :param installed_app: installed app whose ``app`` attribute provides the app model
        :return: dict whose ``data`` key holds the response body decoded as latin1
        :raises AppUnavailableError: when text-to-speech is not enabled in the app's model config
        """
        app_model = installed_app.app
        app_model_config: AppModelConfig = app_model.app_model_config

        # refuse the request unless the app config has text-to-speech switched on
        if not app_model_config.text_to_speech_dict['enabled']:
            raise AppUnavailableError()

        try:
            response = AudioService.transcript_tts(
                tenant_id=app_model.tenant_id,
                text=request.form['text'],
                streaming=False
            )
            # latin1 maps every byte to exactly one code point, so the body bytes survive as a str
            return {'data': response.data.decode('latin1')}
        # Service/provider exceptions are re-raised as controller-level error types.
        except services.errors.app_model_config.AppModelConfigBrokenError:
            logging.exception("App model config broken.")
            raise AppUnavailableError()
        except NoAudioUploadedServiceError:
            raise NoAudioUploadedError()
        except AudioTooLargeServiceError as e:
            raise AudioTooLargeError(str(e))
        except UnsupportedAudioTypeServiceError:
            raise UnsupportedAudioTypeError()
        except ProviderNotSupportSpeechToTextServiceError:
            raise ProviderNotSupportSpeechToTextError()
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()
        except InvokeError as e:
            raise CompletionRequestError(e.description)
        except ValueError as e:
            # ValueError is passed through unchanged instead of being logged as an internal error
            raise e
        except Exception as e:
            logging.exception("internal server error.")
            raise InternalServerError()
|
||||
|
||||
|
||||
api.add_resource(ChatAudioApi, '/installed-apps/<uuid:installed_app_id>/audio-to-text', endpoint='installed_app_audio')
|
||||
api.add_resource(ChatTextApi, '/installed-apps/<uuid:installed_app_id>/text-to-audio', endpoint='installed_app_text')
|
||||
|
||||
@@ -31,6 +31,7 @@ class AppParameterApi(InstalledAppResource):
|
||||
'suggested_questions': fields.Raw,
|
||||
'suggested_questions_after_answer': fields.Raw,
|
||||
'speech_to_text': fields.Raw,
|
||||
'text_to_speech': fields.Raw,
|
||||
'retriever_resource': fields.Raw,
|
||||
'annotation_reply': fields.Raw,
|
||||
'more_like_this': fields.Raw,
|
||||
@@ -51,6 +52,7 @@ class AppParameterApi(InstalledAppResource):
|
||||
'suggested_questions': app_model_config.suggested_questions_list,
|
||||
'suggested_questions_after_answer': app_model_config.suggested_questions_after_answer_dict,
|
||||
'speech_to_text': app_model_config.speech_to_text_dict,
|
||||
'text_to_speech': app_model_config.text_to_speech_dict,
|
||||
'retriever_resource': app_model_config.retriever_resource_dict,
|
||||
'annotation_reply': app_model_config.annotation_reply_dict,
|
||||
'more_like_this': app_model_config.more_like_this_dict,
|
||||
|
||||
@@ -33,6 +33,7 @@ class AppParameterApi(AppApiResource):
|
||||
'suggested_questions': fields.Raw,
|
||||
'suggested_questions_after_answer': fields.Raw,
|
||||
'speech_to_text': fields.Raw,
|
||||
'text_to_speech': fields.Raw,
|
||||
'retriever_resource': fields.Raw,
|
||||
'annotation_reply': fields.Raw,
|
||||
'more_like_this': fields.Raw,
|
||||
@@ -52,6 +53,7 @@ class AppParameterApi(AppApiResource):
|
||||
'suggested_questions': app_model_config.suggested_questions_list,
|
||||
'suggested_questions_after_answer': app_model_config.suggested_questions_after_answer_dict,
|
||||
'speech_to_text': app_model_config.speech_to_text_dict,
|
||||
'text_to_speech': app_model_config.text_to_speech_dict,
|
||||
'retriever_resource': app_model_config.retriever_resource_dict,
|
||||
'annotation_reply': app_model_config.annotation_reply_dict,
|
||||
'more_like_this': app_model_config.more_like_this_dict,
|
||||
|
||||
@@ -10,6 +10,7 @@ from controllers.service_api.wraps import AppApiResource
|
||||
from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
|
||||
from core.model_runtime.errors.invoke import InvokeError
|
||||
from flask import request
|
||||
from flask_restful import reqparse
|
||||
from models.model import App, AppModelConfig
|
||||
from services.audio_service import AudioService
|
||||
from services.errors.audio import (AudioTooLargeServiceError, NoAudioUploadedServiceError,
|
||||
@@ -22,14 +23,15 @@ class AudioApi(AppApiResource):
|
||||
app_model_config: AppModelConfig = app_model.app_model_config
|
||||
|
||||
if not app_model_config.speech_to_text_dict['enabled']:
|
||||
raise AppUnavailableError()
|
||||
raise AppUnavailableError()
|
||||
|
||||
file = request.files['file']
|
||||
|
||||
try:
|
||||
response = AudioService.transcript(
|
||||
response = AudioService.transcript_asr(
|
||||
tenant_id=app_model.tenant_id,
|
||||
file=file,
|
||||
end_user=end_user
|
||||
)
|
||||
|
||||
return response
|
||||
@@ -57,5 +59,49 @@ class AudioApi(AppApiResource):
|
||||
except Exception as e:
|
||||
logging.exception("internal server error.")
|
||||
raise InternalServerError()
|
||||
|
||||
api.add_resource(AudioApi, '/audio-to-text')
|
||||
|
||||
|
||||
class TextApi(AppApiResource):
    """Service-API resource that turns JSON-posted text into audio."""

    def post(self, app_model: App, end_user):
        """Synthesize speech for the ``text`` field of the JSON body.

        Both ``text`` and ``user`` are required, non-null JSON fields.

        :param app_model: app whose tenant is used for the synthesis call
        :param end_user: not used in this method; the ``user`` field from the body is forwarded instead
        :return: whatever ``AudioService.transcript_tts`` returns, unmodified
        """
        parser = reqparse.RequestParser()
        parser.add_argument('text', type=str, required=True, nullable=False, location='json')
        parser.add_argument('user', type=str, required=True, nullable=False, location='json')
        args = parser.parse_args()

        try:
            response = AudioService.transcript_tts(
                tenant_id=app_model.tenant_id,
                text=args['text'],
                end_user=args['user'],
                streaming=False
            )

            return response
        # Service/provider exceptions are re-raised as controller-level error types.
        except services.errors.app_model_config.AppModelConfigBrokenError:
            logging.exception("App model config broken.")
            raise AppUnavailableError()
        except NoAudioUploadedServiceError:
            raise NoAudioUploadedError()
        except AudioTooLargeServiceError as e:
            raise AudioTooLargeError(str(e))
        except UnsupportedAudioTypeServiceError:
            raise UnsupportedAudioTypeError()
        except ProviderNotSupportSpeechToTextServiceError:
            raise ProviderNotSupportSpeechToTextError()
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()
        except InvokeError as e:
            raise CompletionRequestError(e.description)
        except ValueError as e:
            # ValueError is passed through unchanged instead of being logged as an internal error
            raise e
        except Exception as e:
            logging.exception("internal server error.")
            raise InternalServerError()
|
||||
|
||||
|
||||
api.add_resource(AudioApi, '/audio-to-text')
|
||||
api.add_resource(TextApi, '/text-to-audio')
|
||||
|
||||
@@ -32,6 +32,7 @@ class AppParameterApi(WebApiResource):
|
||||
'suggested_questions': fields.Raw,
|
||||
'suggested_questions_after_answer': fields.Raw,
|
||||
'speech_to_text': fields.Raw,
|
||||
'text_to_speech': fields.Raw,
|
||||
'retriever_resource': fields.Raw,
|
||||
'annotation_reply': fields.Raw,
|
||||
'more_like_this': fields.Raw,
|
||||
@@ -51,6 +52,7 @@ class AppParameterApi(WebApiResource):
|
||||
'suggested_questions': app_model_config.suggested_questions_list,
|
||||
'suggested_questions_after_answer': app_model_config.suggested_questions_after_answer_dict,
|
||||
'speech_to_text': app_model_config.speech_to_text_dict,
|
||||
'text_to_speech': app_model_config.text_to_speech_dict,
|
||||
'retriever_resource': app_model_config.retriever_resource_dict,
|
||||
'annotation_reply': app_model_config.annotation_reply_dict,
|
||||
'more_like_this': app_model_config.more_like_this_dict,
|
||||
|
||||
@@ -28,7 +28,7 @@ class AudioApi(WebApiResource):
|
||||
file = request.files['file']
|
||||
|
||||
try:
|
||||
response = AudioService.transcript(
|
||||
response = AudioService.transcript_asr(
|
||||
tenant_id=app_model.tenant_id,
|
||||
file=file,
|
||||
)
|
||||
@@ -59,4 +59,43 @@ class AudioApi(WebApiResource):
|
||||
logging.exception("internal server error.")
|
||||
raise InternalServerError()
|
||||
|
||||
api.add_resource(AudioApi, '/audio-to-text')
|
||||
|
||||
class TextApi(WebApiResource):
    """Web-app resource that turns posted text into audio."""

    def post(self, app_model: App, end_user):
        """Synthesize speech for the ``text`` form field of the request.

        :param app_model: app whose tenant is used for the synthesis call
        :param end_user: caller; its ``external_user_id`` is forwarded to the service
        :return: dict whose ``data`` key holds the response body decoded as latin1
        """
        try:
            response = AudioService.transcript_tts(
                tenant_id=app_model.tenant_id,
                text=request.form['text'],
                end_user=end_user.external_user_id,
                streaming=False
            )

            # latin1 maps every byte to exactly one code point, so the body bytes survive as a str
            return {'data': response.data.decode('latin1')}
        # Service/provider exceptions are re-raised as controller-level error types.
        except services.errors.app_model_config.AppModelConfigBrokenError:
            logging.exception("App model config broken.")
            raise AppUnavailableError()
        except NoAudioUploadedServiceError:
            raise NoAudioUploadedError()
        except AudioTooLargeServiceError as e:
            raise AudioTooLargeError(str(e))
        except UnsupportedAudioTypeServiceError:
            raise UnsupportedAudioTypeError()
        except ProviderNotSupportSpeechToTextServiceError:
            raise ProviderNotSupportSpeechToTextError()
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()
        except InvokeError as e:
            raise CompletionRequestError(e.description)
        except ValueError as e:
            # ValueError is passed through unchanged instead of being logged as an internal error
            raise e
        except Exception as e:
            logging.exception("internal server error.")
            raise InternalServerError()
|
||||
|
||||
|
||||
api.add_resource(AudioApi, '/audio-to-text')
|
||||
api.add_resource(TextApi, '/text-to-audio')
|
||||
|
||||
@@ -555,6 +555,12 @@ class ApplicationManager:
|
||||
if 'enabled' in speech_to_text_dict and speech_to_text_dict['enabled']:
|
||||
properties['speech_to_text'] = True
|
||||
|
||||
# text to speech
|
||||
text_to_speech_dict = copy_app_model_config_dict.get('text_to_speech')
|
||||
if text_to_speech_dict:
|
||||
if 'enabled' in text_to_speech_dict and text_to_speech_dict['enabled']:
|
||||
properties['text_to_speech'] = True
|
||||
|
||||
# sensitive word avoidance
|
||||
sensitive_word_avoidance_dict = copy_app_model_config_dict.get('sensitive_word_avoidance')
|
||||
if sensitive_word_avoidance_dict:
|
||||
|
||||
@@ -219,6 +219,7 @@ class AppOrchestrationConfigEntity(BaseModel):
|
||||
show_retrieve_source: bool = False
|
||||
more_like_this: bool = False
|
||||
speech_to_text: bool = False
|
||||
text_to_speech: bool = False
|
||||
sensitive_word_avoidance: Optional[SensitiveWordAvoidanceEntity] = None
|
||||
|
||||
|
||||
@@ -283,7 +284,6 @@ class ApplicationGenerateEntity(BaseModel):
|
||||
query: Optional[str] = None
|
||||
files: list[FileObj] = []
|
||||
user_id: str
|
||||
|
||||
# extras
|
||||
stream: bool
|
||||
invoke_from: InvokeFrom
|
||||
|
||||
@@ -12,6 +12,7 @@ from core.model_runtime.model_providers.__base.large_language_model import Large
|
||||
from core.model_runtime.model_providers.__base.moderation_model import ModerationModel
|
||||
from core.model_runtime.model_providers.__base.rerank_model import RerankModel
|
||||
from core.model_runtime.model_providers.__base.speech2text_model import Speech2TextModel
|
||||
from core.model_runtime.model_providers.__base.tts_model import TTSModel
|
||||
from core.model_runtime.model_providers.__base.text_embedding_model import TextEmbeddingModel
|
||||
from core.provider_manager import ProviderManager
|
||||
|
||||
@@ -144,7 +145,7 @@ class ModelInstance:
|
||||
user=user
|
||||
)
|
||||
|
||||
def invoke_speech2text(self, file: IO[bytes], user: Optional[str] = None, **params) \
|
||||
def invoke_speech2text(self, file: IO[bytes], user: Optional[str] = None) \
|
||||
-> str:
|
||||
"""
|
||||
Invoke large language model
|
||||
@@ -161,8 +162,29 @@ class ModelInstance:
|
||||
model=self.model,
|
||||
credentials=self.credentials,
|
||||
file=file,
|
||||
user=user
|
||||
)
|
||||
|
||||
def invoke_tts(self, content_text: str, streaming: bool, user: Optional[str] = None) \
|
||||
-> str:
|
||||
"""
|
||||
Invoke large language model
|
||||
|
||||
:param content_text: text content to be translated
|
||||
:param user: unique user id
|
||||
:param streaming: output is streaming
|
||||
:return: text for given audio file
|
||||
"""
|
||||
if not isinstance(self.model_type_instance, TTSModel):
|
||||
raise Exception(f"Model type instance is not TTSModel")
|
||||
|
||||
self.model_type_instance = cast(TTSModel, self.model_type_instance)
|
||||
return self.model_type_instance.invoke(
|
||||
model=self.model,
|
||||
credentials=self.credentials,
|
||||
content_text=content_text,
|
||||
user=user,
|
||||
**params
|
||||
streaming=streaming
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ class ModelType(Enum):
|
||||
RERANK = "rerank"
|
||||
SPEECH2TEXT = "speech2text"
|
||||
MODERATION = "moderation"
|
||||
# TTS = "tts"
|
||||
TTS = "tts"
|
||||
# TEXT2IMG = "text2img"
|
||||
|
||||
@classmethod
|
||||
@@ -33,6 +33,8 @@ class ModelType(Enum):
|
||||
return cls.RERANK
|
||||
elif origin_model_type == 'speech2text' or origin_model_type == cls.SPEECH2TEXT.value:
|
||||
return cls.SPEECH2TEXT
|
||||
elif origin_model_type == 'tts' or origin_model_type == cls.TTS.value:
|
||||
return cls.TTS
|
||||
elif origin_model_type == cls.MODERATION.value:
|
||||
return cls.MODERATION
|
||||
else:
|
||||
@@ -52,6 +54,8 @@ class ModelType(Enum):
|
||||
return 'reranking'
|
||||
elif self == self.SPEECH2TEXT:
|
||||
return 'speech2text'
|
||||
elif self == self.TTS:
|
||||
return 'tts'
|
||||
elif self == self.MODERATION:
|
||||
return 'moderation'
|
||||
else:
|
||||
@@ -120,6 +124,10 @@ class ModelPropertyKey(Enum):
|
||||
FILE_UPLOAD_LIMIT = "file_upload_limit"
|
||||
SUPPORTED_FILE_EXTENSIONS = "supported_file_extensions"
|
||||
MAX_CHARACTERS_PER_CHUNK = "max_characters_per_chunk"
|
||||
DEFAULT_VOICE = "default_voice"
|
||||
WORD_LIMIT = "word_limit"
|
||||
AUDOI_TYPE = "audio_type"
|
||||
MAX_WORKERS = "max_workers"
|
||||
|
||||
|
||||
class ProviderModel(BaseModel):
|
||||
|
||||
42
api/core/model_runtime/model_providers/__base/tts_model.py
Normal file
42
api/core/model_runtime/model_providers/__base/tts_model.py
Normal file
@@ -0,0 +1,42 @@
|
||||
from abc import abstractmethod
|
||||
from typing import Optional
|
||||
|
||||
from core.model_runtime.entities.model_entities import ModelType
|
||||
from core.model_runtime.model_providers.__base.ai_model import AIModel
|
||||
|
||||
|
||||
class TTSModel(AIModel):
    """
    Base class for text-to-speech (TTS) models.
    """
    model_type: ModelType = ModelType.TTS

    def invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None):
        """
        Invoke the text-to-speech model.

        Delegates to the provider-specific ``_invoke`` and converts any
        exception it raises through ``_transform_invoke_error``.

        :param model: model name
        :param credentials: model credentials
        :param content_text: text content to be converted to speech
        :param streaming: output is streaming
        :param user: unique user id
        :return: whatever the provider's ``_invoke`` returns
        """
        try:
            return self._invoke(model=model, credentials=credentials, user=user, streaming=streaming, content_text=content_text)
        except Exception as e:
            raise self._transform_invoke_error(e)

    @abstractmethod
    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None):
        """
        Provider-specific text-to-speech call; subclasses must override it.

        :param model: model name
        :param credentials: model credentials
        :param content_text: text content to be converted to speech
        :param streaming: output is streaming
        :param user: unique user id
        :return: provider-defined result (presumably the synthesized audio -- confirm per provider)
        """
        raise NotImplementedError
|
||||
@@ -20,6 +20,7 @@ supported_model_types:
|
||||
- text-embedding
|
||||
- speech2text
|
||||
- moderation
|
||||
- tts
|
||||
configurate_methods:
|
||||
- predefined-model
|
||||
- customizable-model
|
||||
|
||||
@@ -0,0 +1,7 @@
|
||||
model: tts-1-hd
|
||||
model_type: tts
|
||||
model_properties:
|
||||
default_voice: 'alloy'
|
||||
word_limit: 120
|
||||
audio_type: 'mp3'
|
||||
max_workers: 5
|
||||
@@ -0,0 +1,7 @@
|
||||
model: tts-1
|
||||
model_type: tts
|
||||
model_properties:
|
||||
default_voice: 'alloy'
|
||||
word_limit: 120
|
||||
audio_type: 'mp3'
|
||||
max_workers: 5
|
||||
235
api/core/model_runtime/model_providers/openai/tts/tts.py
Normal file
235
api/core/model_runtime/model_providers/openai/tts/tts.py
Normal file
@@ -0,0 +1,235 @@
|
||||
import uuid
|
||||
import hashlib
|
||||
import subprocess
|
||||
from io import BytesIO
|
||||
from typing import Optional
|
||||
from functools import reduce
|
||||
from pydub import AudioSegment
|
||||
|
||||
from core.model_runtime.entities.model_entities import ModelPropertyKey
|
||||
from core.model_runtime.errors.validate import CredentialsValidateFailedError
|
||||
from core.model_runtime.errors.invoke import InvokeBadRequestError
|
||||
from core.model_runtime.model_providers.__base.tts_model import TTSModel
|
||||
from core.model_runtime.model_providers.openai._common import _CommonOpenAI
|
||||
|
||||
from typing_extensions import Literal
|
||||
from flask import Response, stream_with_context
|
||||
from openai import OpenAI
|
||||
import concurrent.futures
|
||||
|
||||
|
||||
class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
|
||||
"""
|
||||
Model class for OpenAI text-to-speech model.
|
||||
"""
|
||||
def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
    """
    Invoke the text2speech model.

    :param model: model name
    :param credentials: model credentials
    :param content_text: text content to be converted to speech
    :param streaming: output is streaming
    :param user: unique user id
    :return: a flask ``Response`` wrapping ``_tts_invoke_streaming`` when ``streaming`` is true,
        otherwise the result of ``_tts_invoke``
    """
    # NOTE(review): the ``any`` return annotation is the builtin function, not ``typing.Any``.
    # Fails fast with InvokeBadRequestError when ffmpeg cannot be run.
    self._is_ffmpeg_installed()
    audio_type = self._get_model_audio_type(model, credentials)
    if streaming:
        return Response(stream_with_context(self._tts_invoke_streaming(model=model,
                                                                       credentials=credentials,
                                                                       content_text=content_text,
                                                                       user=user)),
                        status=200, mimetype=f'audio/{audio_type}')
    else:
        return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user)
|
||||
|
||||
def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
    """
    Validate credentials by synthesizing a short fixed sentence.

    :param model: model name
    :param credentials: model credentials
    :param user: unique user id
    :return: None
    :raises CredentialsValidateFailedError: when the test synthesis raises any exception
    """
    try:
        # the result is discarded; only success or failure of the call matters
        self._tts_invoke(
            model=model,
            credentials=credentials,
            content_text='Hello world!',
            user=user
        )
    except Exception as ex:
        raise CredentialsValidateFailedError(str(ex))
|
||||
|
||||
def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
    """
    Synthesize ``content_text`` and return it as a single audio response.

    The text is split into chunks, each chunk is synthesized in a worker
    thread, and the decoded segments are concatenated in the original
    chunk order before being exported in the model's audio type.

    :param model: model name
    :param credentials: model credentials
    :param content_text: text content to be converted to speech
    :param user: unique user id (accepted for interface parity; not used here)
    :return: flask ``Response`` holding the combined audio
    :raises InvokeBadRequestError: when synthesis, decoding or export fails,
        or when no audio at all was produced for the text
    """
    audio_type = self._get_model_audio_type(model, credentials)
    word_limit = self._get_model_word_limit(model, credentials)
    max_workers = self._get_model_workers_limit(model, credentials)

    try:
        sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))

        # One task per chunk; results are collected in submission order so the
        # audio keeps the order of the text even if later chunks finish first.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(self._process_sentence, sentence, model, credentials)
                       for sentence in sentences]
            audio_bytes_list = [future.result() for future in futures]

        audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type)
                          for audio_bytes in audio_bytes_list if audio_bytes]
        if not audio_segments:
            # reduce() without an initial value raises an opaque TypeError on an
            # empty list; report the real condition instead.
            raise ValueError("no audio was generated for the given text")

        combined_segment = reduce(lambda x, y: x + y, audio_segments)
        buffer: BytesIO = BytesIO()
        combined_segment.export(buffer, format=audio_type)
        return Response(buffer.getvalue(), status=200, mimetype=f"audio/{audio_type}")
    except Exception as ex:
        # single wrapping point; keep the original exception as the cause
        raise InvokeBadRequestError(str(ex)) from ex
|
||||
|
||||
# TODO: improve the streaming function.
# NOTE(review): the body contains no ``yield`` and no ``return``, so a call returns None
# after the files are written, while ``_invoke`` passes the call result to
# ``stream_with_context`` -- confirm what the streaming path is meant to produce.
def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
    """
    Synthesize ``content_text`` chunk by chunk and write the audio to a file.

    :param model: model name
    :param credentials: model credentials
    :param content_text: text content to be converted to speech
    :param user: unique user id (not used in this method)
    :return: None (no value is returned or yielded)
    :raises InvokeBadRequestError: when client creation, synthesis or the file write fails
    """
    # transform credentials to kwargs for model instance
    credentials_kwargs = self._to_credential_kwargs(credentials)
    voice_name = self._get_model_voice(model, credentials)
    word_limit = self._get_model_word_limit(model, credentials)
    audio_type = self._get_model_audio_type(model, credentials)
    # file name is derived from the text, so the same text maps to the same path
    tts_file_id = self._get_file_name(content_text)
    file_path = f'storage/generate_files/{audio_type}/{tts_file_id}.{audio_type}'
    try:
        client = OpenAI(**credentials_kwargs)
        sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
        for sentence in sentences:
            response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
            # NOTE(review): every chunk is written to the same file_path; presumably each
            # write replaces the previous chunk's audio -- confirm against stream_to_file.
            response.stream_to_file(file_path)
    except Exception as ex:
        raise InvokeBadRequestError(str(ex))
|
||||
|
||||
def _get_model_voice(self, model: str, credentials: dict) -> Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"]:
    """
    Get the default voice configured for the given tts model.

    :param model: model name
    :param credentials: model credentials
    :return: the schema's ``default_voice`` property; implicitly None when the
        schema is missing or does not define it
    """
    model_schema = self.get_model_schema(model, credentials)

    if model_schema and ModelPropertyKey.DEFAULT_VOICE in model_schema.model_properties:
        return model_schema.model_properties[ModelPropertyKey.DEFAULT_VOICE]
|
||||
|
||||
def _get_model_audio_type(self, model: str, credentials: dict) -> str:
    """
    Get the audio type configured for the given tts model.

    :param model: model name
    :param credentials: model credentials
    :return: the schema's ``audio_type`` property; implicitly None when the
        schema is missing or does not define it
    """
    model_schema = self.get_model_schema(model, credentials)

    # AUDOI_TYPE is the enum member's actual (misspelled) name; its value is "audio_type"
    if model_schema and ModelPropertyKey.AUDOI_TYPE in model_schema.model_properties:
        return model_schema.model_properties[ModelPropertyKey.AUDOI_TYPE]
|
||||
|
||||
def _get_model_word_limit(self, model: str, credentials: dict) -> int:
    """
    Get the word limit configured for the given tts model.

    :param model: model name
    :param credentials: model credentials
    :return: the schema's ``word_limit`` property; implicitly None when the
        schema is missing or does not define it
    """
    model_schema = self.get_model_schema(model, credentials)

    if model_schema and ModelPropertyKey.WORD_LIMIT in model_schema.model_properties:
        return model_schema.model_properties[ModelPropertyKey.WORD_LIMIT]
|
||||
|
||||
def _get_model_workers_limit(self, model: str, credentials: dict) -> int:
    """
    Get the maximum number of workers configured for the given tts model.

    :param model: model name
    :param credentials: model credentials
    :return: the schema's ``max_workers`` property; implicitly None when the
        schema is missing or does not define it
    """
    model_schema = self.get_model_schema(model, credentials)

    if model_schema and ModelPropertyKey.MAX_WORKERS in model_schema.model_properties:
        return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS]
|
||||
|
||||
@staticmethod
def _split_text_into_sentences(text: str, limit: int, delimiters=None):
    """
    Split ``text`` into chunks that end on a delimiter.

    Characters are accumulated until a delimiter is met after at least
    ``limit`` characters have been collected; the chunk (delimiter included)
    is then yielded. Text without a qualifying delimiter is yielded as one
    final chunk, so a chunk may be longer than ``limit``.

    Chunks that consist only of whitespace (for example a trailing newline
    left over after the last sentence) are skipped: callers strip each chunk
    before sending it to the speech API, and an empty input would make the
    whole request fail.

    :param text: text to split
    :param limit: minimum number of characters collected before a delimiter ends a chunk
    :param delimiters: iterable of single-character delimiters; defaults to ``'。!?;\\n'``
    :return: generator of non-blank text chunks in original order
    """
    if delimiters is None:
        delimiters = set('。!?;\n')

    buf = []
    # counts characters (not words) collected since the last yielded chunk
    word_count = 0
    for char in text:
        buf.append(char)
        if char in delimiters and word_count >= limit:
            chunk = ''.join(buf)
            if chunk.strip():
                yield chunk
            buf = []
            word_count = 0
        else:
            word_count += 1

    if buf:
        chunk = ''.join(buf)
        if chunk.strip():
            yield chunk
|
||||
|
||||
@staticmethod
def _get_file_name(file_content: str) -> str:
    """
    Derive a deterministic file id from the given text.

    The SHA-256 hex digest of the encoded text is turned into a version-5
    UUID under a fixed namespace, so equal text always gives the same id.

    :param file_content: text the id is derived from
    :return: UUID string
    """
    namespace = uuid.UUID('a5da6ef9-b303-596f-8e88-bf8fa40f4b31')
    digest = hashlib.sha256(file_content.encode()).hexdigest()
    return str(uuid.uuid5(namespace, digest))
|
||||
|
||||
def _process_sentence(self, sentence: str, model: str, credentials: dict):
    """
    Synthesize one text chunk through the OpenAI speech API.

    :param sentence: text chunk to be converted to speech; stripped before sending
    :param model: model name
    :param credentials: model credentials
    :return: audio content as bytes, or None when the response body is not bytes
    """
    # transform credentials to kwargs for model instance
    credentials_kwargs = self._to_credential_kwargs(credentials)
    voice_name = self._get_model_voice(model, credentials)

    client = OpenAI(**credentials_kwargs)
    response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
    # read the body once and reuse it instead of calling read() a second time
    audio = response.read()
    if isinstance(audio, bytes):
        return audio
    return None
|
||||
|
||||
@staticmethod
def _is_ffmpeg_installed():
    """
    Check that the ``ffmpeg`` executable can be run.

    :return: True when ``ffmpeg -version`` runs and reports a version
    :raises InvokeBadRequestError: when ffmpeg cannot be started, exits with
        an error, or its output does not contain a version banner
    """
    try:
        # argument list instead of a shell string: no shell is spawned for a fixed command
        output = subprocess.check_output(["ffmpeg", "-version"])
    except Exception as ex:
        raise InvokeBadRequestError("ffmpeg is not installed") from ex

    # undecodable bytes must not hide an otherwise valid version banner
    if "ffmpeg version" in output.decode("utf-8", errors="replace"):
        return True
    raise InvokeBadRequestError("ffmpeg is not installed")
|
||||
@@ -19,6 +19,7 @@ model_config_fields = {
|
||||
'suggested_questions': fields.Raw(attribute='suggested_questions_list'),
|
||||
'suggested_questions_after_answer': fields.Raw(attribute='suggested_questions_after_answer_dict'),
|
||||
'speech_to_text': fields.Raw(attribute='speech_to_text_dict'),
|
||||
'text_to_speech': fields.Raw(attribute='text_to_speech_dict'),
|
||||
'retriever_resource': fields.Raw(attribute='retriever_resource_dict'),
|
||||
'annotation_reply': fields.Raw(attribute='annotation_reply_dict'),
|
||||
'more_like_this': fields.Raw(attribute='more_like_this_dict'),
|
||||
|
||||
32
api/migrations/versions/b24be59fbb04_.py
Normal file
32
api/migrations/versions/b24be59fbb04_.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""empty message
|
||||
|
||||
Revision ID: b24be59fbb04
|
||||
Revises: de95f5c77138
|
||||
Create Date: 2024-01-17 01:31:12.670556
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = 'b24be59fbb04'
|
||||
down_revision = 'de95f5c77138'
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade():
    """Add the nullable ``text_to_speech`` Text column to ``app_model_configs``."""
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table('app_model_configs', schema=None) as batch_op:
        batch_op.add_column(sa.Column('text_to_speech', sa.Text(), nullable=True))

    # ### end Alembic commands ###
|
||||
|
||||
|
||||
def downgrade():
    """Drop the ``text_to_speech`` column from ``app_model_configs``."""
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table('app_model_configs', schema=None) as batch_op:
        batch_op.drop_column('text_to_speech')

    # ### end Alembic commands ###
|
||||
@@ -146,6 +146,7 @@ class AppModelConfig(db.Model):
|
||||
suggested_questions = db.Column(db.Text)
|
||||
suggested_questions_after_answer = db.Column(db.Text)
|
||||
speech_to_text = db.Column(db.Text)
|
||||
text_to_speech = db.Column(db.Text)
|
||||
more_like_this = db.Column(db.Text)
|
||||
model = db.Column(db.Text)
|
||||
user_input_form = db.Column(db.Text)
|
||||
@@ -184,6 +185,11 @@ class AppModelConfig(db.Model):
|
||||
return json.loads(self.speech_to_text) if self.speech_to_text \
|
||||
else {"enabled": False}
|
||||
|
||||
@property
def text_to_speech_dict(self) -> dict:
    """Return the decoded ``text_to_speech`` setting, or a disabled default.

    ``self.text_to_speech`` is parsed as JSON when it is truthy; otherwise
    the feature is reported as ``{"enabled": False}``.
    """
    raw_config = self.text_to_speech
    if not raw_config:
        return {"enabled": False}
    return json.loads(raw_config)
|
||||
|
||||
@property
|
||||
def retriever_resource_dict(self) -> dict:
|
||||
return json.loads(self.retriever_resource) if self.retriever_resource \
|
||||
@@ -263,6 +269,7 @@ class AppModelConfig(db.Model):
|
||||
"suggested_questions": self.suggested_questions_list,
|
||||
"suggested_questions_after_answer": self.suggested_questions_after_answer_dict,
|
||||
"speech_to_text": self.speech_to_text_dict,
|
||||
"text_to_speech": self.text_to_speech_dict,
|
||||
"retriever_resource": self.retriever_resource_dict,
|
||||
"annotation_reply": self.annotation_reply_dict,
|
||||
"more_like_this": self.more_like_this_dict,
|
||||
@@ -289,6 +296,8 @@ class AppModelConfig(db.Model):
|
||||
self.suggested_questions_after_answer = json.dumps(model_config['suggested_questions_after_answer'])
|
||||
self.speech_to_text = json.dumps(model_config['speech_to_text']) \
|
||||
if model_config.get('speech_to_text') else None
|
||||
self.text_to_speech = json.dumps(model_config['text_to_speech']) \
|
||||
if model_config.get('text_to_speech') else None
|
||||
self.more_like_this = json.dumps(model_config['more_like_this'])
|
||||
self.sensitive_word_avoidance = json.dumps(model_config['sensitive_word_avoidance']) \
|
||||
if model_config.get('sensitive_word_avoidance') else None
|
||||
@@ -323,6 +332,7 @@ class AppModelConfig(db.Model):
|
||||
suggested_questions=self.suggested_questions,
|
||||
suggested_questions_after_answer=self.suggested_questions_after_answer,
|
||||
speech_to_text=self.speech_to_text,
|
||||
text_to_speech=self.text_to_speech,
|
||||
more_like_this=self.more_like_this,
|
||||
sensitive_word_avoidance=self.sensitive_word_avoidance,
|
||||
external_data_tools=self.external_data_tools,
|
||||
|
||||
@@ -65,3 +65,4 @@ httpx[socks]~=0.24.1
|
||||
pydub~=0.25.1
|
||||
matplotlib~=3.8.2
|
||||
yfinance~=0.2.35
|
||||
pydub~=0.25.1
|
||||
@@ -86,13 +86,13 @@ class AccountService:
|
||||
db.session.commit()
|
||||
|
||||
return account
|
||||
|
||||
|
||||
@staticmethod
|
||||
def get_account_jwt_token(account):
|
||||
payload = {
|
||||
"user_id": account.id,
|
||||
"exp": datetime.utcnow() + timedelta(days=30),
|
||||
"iss": current_app.config['EDITION'],
|
||||
"iss": current_app.config['EDITION'],
|
||||
"sub": 'Console API Passport',
|
||||
}
|
||||
|
||||
@@ -345,7 +345,7 @@ class TenantService:
|
||||
}
|
||||
if action not in ['add', 'remove', 'update']:
|
||||
raise InvalidActionError("Invalid action.")
|
||||
|
||||
|
||||
if member:
|
||||
if operator.id == member.id:
|
||||
raise CannotOperateSelfError("Cannot operate self.")
|
||||
@@ -546,10 +546,10 @@ class RegisterService:
|
||||
return None
|
||||
|
||||
return {
|
||||
'account': account,
|
||||
'data': invitation_data,
|
||||
'tenant': tenant,
|
||||
}
|
||||
'account': account,
|
||||
'data': invitation_data,
|
||||
'tenant': tenant,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def _get_invitation_by_token(cls, token: str, workspace_id: str, email: str) -> Optional[Dict[str, str]]:
|
||||
|
||||
@@ -95,6 +95,21 @@ class AppModelConfigService:
|
||||
if not isinstance(config["speech_to_text"]["enabled"], bool):
|
||||
raise ValueError("enabled in speech_to_text must be of boolean type")
|
||||
|
||||
# text_to_speech
|
||||
if 'text_to_speech' not in config or not config["text_to_speech"]:
|
||||
config["text_to_speech"] = {
|
||||
"enabled": False
|
||||
}
|
||||
|
||||
if not isinstance(config["text_to_speech"], dict):
|
||||
raise ValueError("text_to_speech must be of dict type")
|
||||
|
||||
if "enabled" not in config["text_to_speech"] or not config["text_to_speech"]["enabled"]:
|
||||
config["text_to_speech"]["enabled"] = False
|
||||
|
||||
if not isinstance(config["text_to_speech"]["enabled"], bool):
|
||||
raise ValueError("enabled in text_to_speech must be of boolean type")
|
||||
|
||||
# return retriever resource
|
||||
if 'retriever_resource' not in config or not config["retriever_resource"]:
|
||||
config["retriever_resource"] = {
|
||||
@@ -317,6 +332,7 @@ class AppModelConfigService:
|
||||
"suggested_questions": config["suggested_questions"],
|
||||
"suggested_questions_after_answer": config["suggested_questions_after_answer"],
|
||||
"speech_to_text": config["speech_to_text"],
|
||||
"text_to_speech": config["text_to_speech"],
|
||||
"retriever_resource": config["retriever_resource"],
|
||||
"more_like_this": config["more_like_this"],
|
||||
"sensitive_word_avoidance": config["sensitive_word_avoidance"],
|
||||
|
||||
@@ -1,22 +1,26 @@
|
||||
import io
|
||||
from typing import Optional
|
||||
|
||||
from core.model_manager import ModelManager
|
||||
from core.model_runtime.entities.model_entities import ModelType
|
||||
from services.errors.audio import (AudioTooLargeServiceError, NoAudioUploadedServiceError,
|
||||
ProviderNotSupportSpeechToTextServiceError, UnsupportedAudioTypeServiceError)
|
||||
from services.errors.audio import (AudioTooLargeServiceError,
|
||||
NoAudioUploadedServiceError,
|
||||
ProviderNotSupportTextToSpeechServiceError,
|
||||
ProviderNotSupportSpeechToTextServiceError,
|
||||
UnsupportedAudioTypeServiceError)
|
||||
from werkzeug.datastructures import FileStorage
|
||||
|
||||
FILE_SIZE = 15
|
||||
FILE_SIZE_LIMIT = FILE_SIZE * 1024 * 1024
|
||||
ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm']
|
||||
ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm', 'amr']
|
||||
|
||||
|
||||
class AudioService:
|
||||
@classmethod
|
||||
def transcript(cls, tenant_id: str, file: FileStorage):
|
||||
def transcript_asr(cls, tenant_id: str, file: FileStorage, end_user: Optional[str] = None):
|
||||
if file is None:
|
||||
raise NoAudioUploadedServiceError()
|
||||
|
||||
|
||||
extension = file.mimetype
|
||||
if extension not in [f'audio/{ext}' for ext in ALLOWED_EXTENSIONS]:
|
||||
raise UnsupportedAudioTypeServiceError()
|
||||
@@ -33,8 +37,26 @@ class AudioService:
|
||||
tenant_id=tenant_id,
|
||||
model_type=ModelType.SPEECH2TEXT
|
||||
)
|
||||
if model_instance is None:
|
||||
raise ProviderNotSupportSpeechToTextServiceError()
|
||||
|
||||
buffer = io.BytesIO(file_content)
|
||||
buffer.name = 'temp.mp3'
|
||||
|
||||
return {"text": model_instance.invoke_speech2text(buffer)}
|
||||
return {"text": model_instance.invoke_speech2text(file=buffer, user=end_user)}
|
||||
|
||||
@classmethod
def transcript_tts(cls, tenant_id: str, text: str, streaming: bool, end_user: Optional[str] = None):
    """Synthesize speech for ``text`` with the tenant's default TTS model.

    :param tenant_id: tenant whose default TTS model instance is looked up
    :param text: text to synthesize; surrounding whitespace is stripped
    :param streaming: forwarded unchanged to ``invoke_tts``
    :param end_user: optional user identifier, forwarded as ``user``
    :return: whatever the model instance's ``invoke_tts`` returns
    :raises ProviderNotSupportTextToSpeechServiceError: when no default
        TTS model instance is available for the tenant
    """
    model_instance = ModelManager().get_default_model_instance(
        tenant_id=tenant_id,
        model_type=ModelType.TTS
    )
    if model_instance is None:
        raise ProviderNotSupportTextToSpeechServiceError()

    # Invocation errors propagate to the caller as-is; catching them here
    # only to re-raise would add nothing but an extra traceback frame.
    return model_instance.invoke_tts(content_text=text.strip(), user=end_user, streaming=streaming)
|
||||
|
||||
@@ -9,5 +9,10 @@ class AudioTooLargeServiceError(Exception):
|
||||
class UnsupportedAudioTypeServiceError(Exception):
    """Raised when an uploaded audio file's MIME type is not in the allowed list."""
|
||||
|
||||
|
||||
class ProviderNotSupportSpeechToTextServiceError(Exception):
    """Raised when no default speech-to-text model instance is available."""
|
||||
|
||||
|
||||
class ProviderNotSupportTextToSpeechServiceError(Exception):
    """Raised when no default text-to-speech model instance is available."""
|
||||
|
||||
Reference in New Issue
Block a user