| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111 |
- #!/usr/bin/env python3
- """
- 简化 Amazon Transcribe JSON 输出,合并为按说话人分段的文本。
- 适合直接喂给 LLM 做会议总结。
- 用法: python simplify_transcript.py testjob.json [-o output.txt]
- """
- import json
- import sys
- import argparse
def load_transcribe(path: str) -> dict:
    """Read the UTF-8 JSON file at *path* and return the parsed object.

    Args:
        path: Filesystem path of an Amazon Transcribe result file.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
def _format_timestamp(start) -> str:
    """Render a start time in seconds as ``[MM:SS]``; a falsy value gives ``""``."""
    if not start:
        return ""
    secs = float(start)
    mins = int(secs // 60)
    s = int(secs % 60)
    return f"[{mins:02d}:{s:02d}]"


def simplify(data: dict) -> str:
    """Merge Amazon Transcribe JSON items into per-speaker text paragraphs.

    Consecutive items from the same speaker are concatenated into one
    paragraph; a new paragraph starts whenever ``speaker_label`` changes.
    Punctuation items never start a paragraph or switch speaker; they are
    appended to the paragraph in progress (and dropped if there is none).
    Items with no alternatives or with empty content are skipped.

    Args:
        data: Parsed Transcribe result; ``data['results']['items']`` must be
            an iterable of item dicts.

    Returns:
        A text report: a header line with speaker and paragraph counts,
        then for each paragraph a ``**speaker** [MM:SS]`` line, the text,
        and a blank line.

    Raises:
        KeyError: If ``results`` or ``items`` is missing from *data*.
        ValueError: If a paragraph's ``start_time`` is not a valid number.
    """
    # Tokens starting with one of these marks never get a space in front.
    no_space_prefixes = (',', '。', '?', '!', '、', ':', ';')

    paragraphs = []
    current_speaker = None
    current_text = []
    current_start = None

    for item in data['results']['items']:
        alternatives = item.get('alternatives') or []
        content = alternatives[0].get('content', '') if alternatives else ''
        if not content:
            # Nothing to emit; skipping also keeps content[0] below safe.
            continue

        # Punctuation attaches to the running paragraph and never changes
        # the speaker.
        if item.get('type') == 'punctuation':
            if current_text:
                current_text.append(content)
            continue

        speaker = item.get('speaker_label', '')
        if speaker != current_speaker:
            # Speaker changed: close the paragraph in progress, if any.
            if current_text:
                paragraphs.append({
                    'speaker': current_speaker,
                    'start': current_start,
                    'text': ''.join(current_text).strip(),
                })
                current_text = []
            current_speaker = speaker
            current_start = item.get('start_time', '')

        # Insert a space only between two ASCII tokens (English words,
        # digits); non-ASCII text such as Chinese is joined without one.
        if current_text and not content.startswith(no_space_prefixes):
            if current_text[-1][-1].isascii() and content[0].isascii():
                current_text.append(' ')
        current_text.append(content)

    # Close the final paragraph.
    if current_text:
        paragraphs.append({
            'speaker': current_speaker,
            'start': current_start,
            'text': ''.join(current_text).strip(),
        })

    # Build the report.
    speaker_count = len({p['speaker'] for p in paragraphs})
    lines = [f"# 会议转录 ({speaker_count} 位发言人, {len(paragraphs)} 段)", ""]
    for p in paragraphs:
        ts = _format_timestamp(p['start'])
        lines.append(f"**{p['speaker']}** {ts}")
        lines.append(p['text'])
        lines.append("")

    return '\n'.join(lines)
def main():
    """Command-line entry point: simplify a Transcribe JSON file to text.

    Parses the input path and optional ``-o/--output`` path, writes the
    simplified transcript as UTF-8, and prints the output path, the
    character counts of result and original file, and their ratio.
    When no output path is given, the input path with its extension
    replaced by ``_simplified.txt`` is used.
    """
    # Local import: only needed here to derive the default output path.
    import os

    parser = argparse.ArgumentParser(description="简化 Amazon Transcribe JSON")
    parser.add_argument("input", help="Amazon Transcribe JSON 文件")
    parser.add_argument("-o", "--output", help="输出文件 (默认: 同名.txt)")
    args = parser.parse_args()

    data = load_transcribe(args.input)
    result = simplify(data)

    # splitext only strips an extension from the final path component, so
    # inputs such as "../job" or "dir.v2/job" keep their directory intact.
    out_path = args.output or os.path.splitext(args.input)[0] + '_simplified.txt'
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(result)

    # Read the original once, and close it, to report the size reduction.
    with open(args.input, encoding='utf-8') as f:
        original_size = len(f.read())

    print(f"已保存: {out_path}")
    print(f"大小: {len(result)} 字符 (原始 JSON: {original_size} 字符)")
    print(f"压缩比: {len(result) / original_size * 100:.1f}%")
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|