transcribe_start.py 1.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
  1. """Start Amazon Transcribe job with language selection."""
  2. import os
  3. import boto3
  4. from shared import S3_BUCKET, update_job
# Transcribe client built once at module import; handler() reuses this single
# instance instead of constructing a new client on every call.
transcribe = boto3.client('transcribe')
  6. AUTO_DETECT_LANGUAGES = ['zh-CN', 'zh-TW', 'yue-CN', 'en-US']
  7. # Amazon Transcribe 语言代码映射
  8. LANG_MAP = {
  9. 'zh-CN': 'zh-CN',
  10. 'zh-TW': 'zh-TW',
  11. 'zh-HK': 'zh-HK',
  12. 'en-US': 'en-US',
  13. }
  14. def handler(event, context):
  15. job_id = event['job_id']
  16. audio_s3_key = event['audio_s3_key']
  17. language = event.get('language', 'auto')
  18. transcribe_job = f"sp-{job_id}"
  19. update_job(job_id, status='TRANSCRIBING')
  20. params = {
  21. 'TranscriptionJobName': transcribe_job,
  22. 'Media': {'MediaFileUri': f"s3://{S3_BUCKET}/{audio_s3_key}"},
  23. 'OutputBucketName': S3_BUCKET,
  24. 'OutputKey': f"jobs/{job_id}/transcribe-output.json",
  25. 'Settings': {
  26. 'ShowSpeakerLabels': True,
  27. 'MaxSpeakerLabels': 10,
  28. },
  29. }
  30. if language == 'auto':
  31. # 自动语言识别,提供候选语言列表
  32. params['IdentifyLanguage'] = True
  33. params['LanguageOptions'] = AUTO_DETECT_LANGUAGES
  34. else:
  35. # 指定语言
  36. params['LanguageCode'] = LANG_MAP.get(language, language)
  37. transcribe.start_transcription_job(**params)
  38. return {**event, 'transcribe_job': transcribe_job}