download.py 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235
  1. """Download audio from SharePoint Stream and upload to S3."""
  2. import json
  3. import re
  4. import os
  5. import time
  6. import boto3
  7. from urllib.parse import unquote, urlparse, parse_qs
  8. from xml.etree import ElementTree as ET
  9. from shared import S3_BUCKET, update_job
# Number of retry iterations fetch_with_retry() runs before its final attempt.
MAX_RETRIES = 5
# Delay before the next attempt, indexed by the zero-based attempt number.
RETRY_BACKOFF = [1, 2, 4, 8, 16] # seconds
# Module-level S3 client used by handler() to read the job input and upload the audio.
s3 = boto3.client('s3')
# XML namespace prefixes passed to ElementTree find()/findall() when reading the manifest.
NS = {
    'mpd': 'urn:mpeg:DASH:schema:MPD:2011',
    'sea': 'urn:mpeg:dash:schema:sea:2012',
}
  17. def handler(event, context):
  18. job_id = event['job_id']
  19. update_job(job_id, status='DOWNLOADING')
  20. # Load input
  21. obj = s3.get_object(Bucket=S3_BUCKET, Key=f"jobs/{job_id}/input.json")
  22. inp = json.loads(obj['Body'].read())
  23. curl_cmd = inp['curl']
  24. cookies_str = inp.get('cookies', '')
  25. language = inp.get('language', 'auto')
  26. # Parse cURL
  27. manifest_url, headers = parse_curl(curl_cmd)
  28. # Setup session
  29. import requests
  30. session = requests.Session()
  31. session.headers.update({
  32. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
  33. })
  34. # Set cookies
  35. if cookies_str:
  36. sp_domain = extract_sp_domain(manifest_url)
  37. cookies_str = cookies_str.strip()
  38. # 支持 Cookie-Editor JSON 数组格式
  39. if cookies_str.startswith('['):
  40. try:
  41. cookie_list = json.loads(cookies_str)
  42. for c in cookie_list:
  43. name = c.get('name', '')
  44. value = c.get('value', '')
  45. domain = c.get('domain', sp_domain)
  46. if domain.startswith('.'):
  47. domain = domain[1:]
  48. if name and value:
  49. session.cookies.set(name, value, domain=domain)
  50. except json.JSONDecodeError:
  51. pass
  52. else:
  53. # name=value; name=value 格式
  54. for pair in cookies_str.split('; '):
  55. if '=' in pair:
  56. name, value = pair.split('=', 1)
  57. session.cookies.set(name.strip(), value.strip(), domain=sp_domain)
  58. # Fetch manifest
  59. svc_headers = {}
  60. for k, v in headers.items():
  61. if k.lower() in ('x-spopactoken', 'origin', 'referer'):
  62. svc_headers[k] = v
  63. resp = fetch_with_retry(session, manifest_url, svc_headers)
  64. manifest_xml = resp.text
  65. # Parse manifest - find audio track
  66. root = ET.fromstring(manifest_xml)
  67. base_url_el = root.find('mpd:BaseURL', NS)
  68. base_url = base_url_el.text.strip() if base_url_el is not None else ''
  69. period = root.find('mpd:Period', NS)
  70. audio_track = None
  71. for adapt in period.findall('mpd:AdaptationSet', NS):
  72. if adapt.get('contentType') == 'audio':
  73. label_el = adapt.find('mpd:Label', NS)
  74. label = label_el.text if label_el is not None else ''
  75. if label == 'OriginalAudio' or audio_track is None:
  76. audio_track = adapt
  77. if label == 'OriginalAudio':
  78. break
  79. if audio_track is None:
  80. raise Exception("No audio track found in manifest")
  81. # Get encryption key
  82. cp = audio_track.find('mpd:ContentProtection', NS)
  83. enc_key = None
  84. enc_iv = None
  85. if cp is not None:
  86. crypto = cp.find('sea:CryptoPeriod', NS)
  87. if crypto is not None:
  88. key_url = crypto.get('keyUriTemplate', '').replace('&', '&')
  89. iv_str = crypto.get('IV', '')
  90. if key_url:
  91. kr = fetch_with_retry(session, key_url, svc_headers)
  92. enc_key = kr.content
  93. if iv_str.startswith('0x'):
  94. iv_str = iv_str[2:]
  95. enc_iv = bytes.fromhex(iv_str) if iv_str else None
  96. # Parse segments
  97. seg_tpl = audio_track.find('mpd:SegmentTemplate', NS)
  98. init_tpl = seg_tpl.get('initialization', '').replace('&', '&')
  99. media_tpl = seg_tpl.get('media', '').replace('&', '&')
  100. rep = audio_track.find('mpd:Representation', NS)
  101. rep_id = rep.get('id', '')
  102. segments = []
  103. timeline = seg_tpl.find('mpd:SegmentTimeline', NS)
  104. t = 0
  105. for s_el in timeline.findall('mpd:S', NS):
  106. d = int(s_el.get('d', 0))
  107. r = int(s_el.get('r', 0))
  108. for _ in range(r + 1):
  109. segments.append(t)
  110. t += d
  111. # Download init + segments
  112. sp_headers = {}
  113. for k, v in headers.items():
  114. if k.lower() in ('origin', 'referer'):
  115. sp_headers[k] = v
  116. audio_data = bytearray()
  117. # Init segment
  118. init_url = resolve_url(base_url, init_tpl, rep_id)
  119. r = fetch_with_retry(session, init_url, sp_headers)
  120. audio_data.extend(decrypt(r.content, enc_key, enc_iv))
  121. # Media segments
  122. total = len(segments)
  123. for idx, seg_time in enumerate(segments):
  124. url = resolve_url(base_url, media_tpl, rep_id, seg_time)
  125. r = fetch_with_retry(session, url, sp_headers)
  126. audio_data.extend(decrypt(r.content, enc_key, enc_iv))
  127. if (idx + 1) % 50 == 0:
  128. print(f"[download] progress: {idx+1}/{total} segments")
  129. # Upload to S3
  130. s3_key = f"jobs/{job_id}/audio.mp4"
  131. s3.put_object(Bucket=S3_BUCKET, Key=s3_key, Body=bytes(audio_data), ContentType='audio/mp4')
  132. update_job(job_id, status='DOWNLOADED', audio_s3_key=s3_key)
  133. return {**event, 'audio_s3_key': s3_key, 'language': language}
  134. def fetch_with_retry(session, url, headers):
  135. """GET with retry on 5xx / connection errors."""
  136. import requests
  137. for attempt in range(MAX_RETRIES):
  138. try:
  139. r = session.get(url, headers=headers, timeout=60)
  140. if r.status_code < 500:
  141. r.raise_for_status()
  142. return r
  143. # 5xx: retry
  144. print(f"[download] HTTP {r.status_code}, retry {attempt+1}/{MAX_RETRIES}")
  145. except (requests.ConnectionError, requests.Timeout) as e:
  146. print(f"[download] {type(e).__name__}, retry {attempt+1}/{MAX_RETRIES}")
  147. if attempt < MAX_RETRIES - 1:
  148. time.sleep(RETRY_BACKOFF[attempt])
  149. # Last attempt — let it raise
  150. r = session.get(url, headers=headers, timeout=60)
  151. r.raise_for_status()
  152. return r
  153. def parse_curl(curl_str):
  154. import shlex
  155. curl_str = curl_str.replace('\\\n', ' ').replace('\\\r\n', ' ')
  156. tokens = shlex.split(curl_str, posix=True)
  157. url = None
  158. headers = {}
  159. i = 0
  160. while i < len(tokens):
  161. t = tokens[i]
  162. if t in ('-H', '--header') and i + 1 < len(tokens):
  163. i += 1
  164. hdr = tokens[i]
  165. if ':' in hdr:
  166. k, v = hdr.split(':', 1)
  167. headers[k.strip()] = v.strip()
  168. elif t.startswith('http'):
  169. url = t
  170. elif t in ('-X', '--request') and i + 1 < len(tokens):
  171. i += 1
  172. i += 1
  173. if not url:
  174. m = re.search(r"(https?://[^\s'\"]+)", curl_str)
  175. if m:
  176. url = m.group(1)
  177. return url, headers
  178. def extract_sp_domain(manifest_url):
  179. qs = parse_qs(urlparse(manifest_url).query)
  180. docid = unquote(qs.get('docid', [''])[0])
  181. if docid:
  182. return urlparse(docid).hostname
  183. return ''
  184. def resolve_url(base_url, template, rep_id, seg_time=None):
  185. url = template
  186. url = re.sub(r'\$RepresentationID[^&]*amp;', rep_id, url)
  187. url = re.sub(r'\$RepresentationID\$', rep_id, url)
  188. if seg_time is not None:
  189. url = re.sub(r'\$Time[^&]*amp;', str(seg_time), url)
  190. url = re.sub(r'\$Time\$', str(seg_time), url)
  191. return base_url + url
  192. def decrypt(data, key, iv):
  193. if not key:
  194. return data
  195. from Crypto.Cipher import AES
  196. cipher = AES.new(key, AES.MODE_CBC, iv)
  197. dec = cipher.decrypt(data)
  198. if len(dec) > 0:
  199. pad = dec[-1]
  200. if 0 < pad <= 16 and dec[-pad:] == bytes([pad]) * pad:
  201. dec = dec[:-pad]
  202. return dec