| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748 |
- """Start Amazon Transcribe job with language selection."""
- import os
- import boto3
- from shared import S3_BUCKET, update_job
# Amazon Transcribe client, created once when the module is imported and
# reused by handler() on every call.
transcribe = boto3.client(service_name='transcribe')
# Candidate languages handed to Transcribe as ``LanguageOptions`` when the
# caller asks for automatic language identification.
# Cantonese is listed as 'zh-HK' (the code LANG_MAP uses as well); 'yue-CN'
# is not an Amazon Transcribe language code.
# NOTE(review): Transcribe restricts LanguageOptions to one dialect per
# language -- confirm the three zh-* codes are accepted together.
AUTO_DETECT_LANGUAGES = ['zh-CN', 'zh-TW', 'zh-HK', 'en-US']

# Mapping from the language value supplied by the caller to the Amazon
# Transcribe language code. Values not listed here are passed through
# unchanged by handler().
LANG_MAP = {
    'zh-CN': 'zh-CN',
    'zh-TW': 'zh-TW',
    'zh-HK': 'zh-HK',
    'en-US': 'en-US',
}
def handler(event, context):
    """Start an Amazon Transcribe job for the audio described by ``event``.

    Marks the job as ``TRANSCRIBING`` through ``update_job`` and then submits
    a transcription job named ``sp-<job_id>``. Speaker labels are enabled
    (up to 10 speakers) and the result is written to
    ``jobs/<job_id>/transcribe-output.json`` in ``S3_BUCKET``.

    Args:
        event: Mapping with the required keys ``job_id`` and
            ``audio_s3_key`` (object key of the audio inside ``S3_BUCKET``)
            and an optional ``language`` key. A missing, ``None`` or empty
            ``language``, or the literal ``'auto'``, requests automatic
            language identification restricted to ``AUTO_DETECT_LANGUAGES``.
            Any other value is translated through ``LANG_MAP`` (unknown
            values pass through unchanged) and sent as the language code.
        context: Unused by this function.

    Returns:
        A shallow copy of ``event`` with an added ``transcribe_job`` key
        holding the transcription job name.

    Raises:
        KeyError: If ``job_id`` or ``audio_s3_key`` is missing from ``event``.

    Exceptions raised by ``update_job`` or by the Transcribe client are not
    caught here and propagate to the caller.
    """
    job_id = event['job_id']
    audio_s3_key = event['audio_s3_key']
    # `or` (rather than a .get() default) also covers an explicit None or
    # empty string, which would otherwise be sent as an invalid LanguageCode.
    language = event.get('language') or 'auto'
    transcribe_job = f"sp-{job_id}"

    # The status is updated before the job is submitted.
    update_job(job_id, status='TRANSCRIBING')

    params = {
        'TranscriptionJobName': transcribe_job,
        'Media': {'MediaFileUri': f"s3://{S3_BUCKET}/{audio_s3_key}"},
        'OutputBucketName': S3_BUCKET,
        'OutputKey': f"jobs/{job_id}/transcribe-output.json",
        'Settings': {
            'ShowSpeakerLabels': True,
            'MaxSpeakerLabels': 10,
        },
    }

    if language == 'auto':
        # Automatic language identification, limited to the candidate list.
        params['IdentifyLanguage'] = True
        params['LanguageOptions'] = AUTO_DETECT_LANGUAGES
    else:
        # Explicit language requested by the caller.
        params['LanguageCode'] = LANG_MAP.get(language, language)

    transcribe.start_transcription_job(**params)
    return {**event, 'transcribe_job': transcribe_job}
|