simplify_transcript.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. #!/usr/bin/env python3
  2. """
  3. 简化 Amazon Transcribe JSON 输出,合并为按说话人分段的文本。
  4. 适合直接喂给 LLM 做会议总结。
  5. 用法: python simplify_transcript.py testjob.json [-o output.txt]
  6. """
  7. import json
  8. import sys
  9. import argparse
  10. def load_transcribe(path: str) -> dict:
  11. with open(path, 'r', encoding='utf-8') as f:
  12. return json.load(f)
  13. def simplify(data: dict) -> str:
  14. """将 Amazon Transcribe JSON 合并为 Speaker + 文本段落"""
  15. items = data['results']['items']
  16. # 构建段落: 按说话人变化分段
  17. paragraphs = []
  18. current_speaker = None
  19. current_text = []
  20. current_start = None
  21. for item in items:
  22. speaker = item.get('speaker_label', '')
  23. content = item['alternatives'][0]['content']
  24. item_type = item['type'] # pronunciation / punctuation
  25. # 标点符号不换说话人,直接追加
  26. if item_type == 'punctuation':
  27. if current_text:
  28. current_text.append(content)
  29. continue
  30. # 说话人变了,保存当前段落
  31. if speaker != current_speaker and current_text:
  32. paragraphs.append({
  33. 'speaker': current_speaker,
  34. 'start': current_start,
  35. 'text': ''.join(current_text).strip(),
  36. })
  37. current_text = []
  38. if speaker != current_speaker:
  39. current_speaker = speaker
  40. current_start = item.get('start_time', '')
  41. # 中文不加空格,英文/数字前加空格
  42. if current_text and not content.startswith((',', '。', '?', '!', '、', ':', ';')):
  43. # 判断是否需要空格 (前一个和当前都是 ASCII 才加)
  44. prev = current_text[-1] if current_text else ''
  45. if prev and prev[-1].isascii() and content[0].isascii():
  46. current_text.append(' ')
  47. current_text.append(content)
  48. # 最后一段
  49. if current_text:
  50. paragraphs.append({
  51. 'speaker': current_speaker,
  52. 'start': current_start,
  53. 'text': ''.join(current_text).strip(),
  54. })
  55. # 格式化输出
  56. lines = []
  57. speaker_count = len(set(p['speaker'] for p in paragraphs))
  58. lines.append(f"# 会议转录 ({speaker_count} 位发言人, {len(paragraphs)} 段)")
  59. lines.append("")
  60. for p in paragraphs:
  61. start = p['start']
  62. if start:
  63. secs = float(start)
  64. mins = int(secs // 60)
  65. s = int(secs % 60)
  66. ts = f"[{mins:02d}:{s:02d}]"
  67. else:
  68. ts = ""
  69. lines.append(f"**{p['speaker']}** {ts}")
  70. lines.append(p['text'])
  71. lines.append("")
  72. return '\n'.join(lines)
  73. def main():
  74. parser = argparse.ArgumentParser(description="简化 Amazon Transcribe JSON")
  75. parser.add_argument("input", help="Amazon Transcribe JSON 文件")
  76. parser.add_argument("-o", "--output", help="输出文件 (默认: 同名.txt)")
  77. args = parser.parse_args()
  78. data = load_transcribe(args.input)
  79. result = simplify(data)
  80. out_path = args.output or args.input.rsplit('.', 1)[0] + '_simplified.txt'
  81. with open(out_path, 'w', encoding='utf-8') as f:
  82. f.write(result)
  83. print(f"已保存: {out_path}")
  84. print(f"大小: {len(result)} 字符 (原始 JSON: {len(open(args.input, encoding='utf-8').read())} 字符)")
  85. print(f"压缩比: {len(result) / len(open(args.input, encoding='utf-8').read()) * 100:.1f}%")
  86. if __name__ == "__main__":
  87. main()