# sp_video_dl.py (39 KB)
  1. #!/usr/bin/env python3
  2. """
  3. SharePoint Stream 视频下载器
  4. 用法:
  5. 1. 浏览器 DevTools -> Network -> 找到 videomanifest 请求
  6. 2. 右键 -> Copy as cURL (bash)
  7. 3. 粘贴到 curl_command.txt 文件中
  8. 4. 运行: python sp_video_dl.py curl_command.txt
  9. 或者直接运行 python sp_video_dl.py 进入交互模式
  10. 依赖: pip install requests pycryptodome
  11. """
  12. import re
  13. import os
  14. import sys
  15. import json
  16. import shlex
  17. import struct
  18. import argparse
  19. import traceback
  20. from io import BytesIO
  21. from urllib.parse import unquote, urljoin, urlparse, parse_qs
  22. from xml.etree import ElementTree as ET
  23. from concurrent.futures import ThreadPoolExecutor, as_completed
  24. try:
  25. import requests
  26. except ImportError:
  27. print("需要安装 requests: pip install requests")
  28. sys.exit(1)
  29. try:
  30. from Crypto.Cipher import AES
  31. except ImportError:
  32. AES = None
  33. print("[警告] 未安装 pycryptodome,将跳过解密步骤")
  34. print(" 安装: pip install pycryptodome")
  35. # ============================================================
  36. # cURL 解析
  37. # ============================================================
  def parse_curl(curl_str: str) -> dict:
      """Parse a cURL command line into its URL, headers and HTTP method.

      Continued lines (a backslash or caret directly before the newline) are
      joined first, then the command is tokenized with ``shlex``.

      Args:
          curl_str: The complete cURL command text.

      Returns:
          A dict with the keys ``"url"`` (str), ``"headers"`` (dict of header
          name to value, both stripped) and ``"method"`` (upper-cased; ``"GET"``
          unless ``-X`` / ``--request`` is present).

      Raises:
          ValueError: If no URL can be found, or if the command still cannot
              be tokenized after its single quotes were normalized.
      """
      # Options whose following token is a value, never the request URL.
      value_options = {
          "-d", "--data", "--data-raw", "--data-binary", "--data-ascii",
          "--data-urlencode", "-F", "--form", "-e", "--referer",
          "-A", "--user-agent", "-b", "--cookie", "-u", "--user",
          "-o", "--output", "-x", "--proxy",
      }

      curl_str = curl_str.strip()
      # Join continued lines (caret or backslash before LF / CRLF).
      for continuation in ("^\n", "\\\n", "^\r\n", "\\\r\n"):
          curl_str = curl_str.replace(continuation, " ")

      try:
          tokens = shlex.split(curl_str, posix=True)
      except ValueError:
          # Quoting that shlex rejects (the original author attributes this to
          # PowerShell-style commands): retry with single quotes doubled up.
          curl_str = curl_str.replace("'", '"')
          tokens = shlex.split(curl_str, posix=True)

      url = None
      headers = {}
      method = "GET"
      i = 0
      while i < len(tokens):
          token = tokens[i]
          if token in ("-H", "--header"):
              i += 1
              if i < len(tokens) and ":" in tokens[i]:
                  name, value = tokens[i].split(":", 1)
                  headers[name.strip()] = value.strip()
          elif token in ("-X", "--request"):
              i += 1
              if i < len(tokens):
                  method = tokens[i].upper()
          elif token in value_options:
              # Skip the option's value so that e.g. a request body or referer
              # starting with "http" is not mistaken for the request URL.
              i += 1
          elif url is None and token.startswith("http"):
              # The first URL-looking positional token is the request URL;
              # later ones must not overwrite it.
              url = token
          i += 1

      # Last resort: pull the first URL-looking substring out of the raw text.
      if not url:
          m = re.search(r"(https?://[^\s'\"]+)", curl_str)
          if m:
              url = m.group(1).rstrip("'\"")
      if not url:
          raise ValueError("无法从 cURL 命令中提取 URL")
      return {"url": url, "headers": headers, "method": method}
  def parse_raw_headers(text: str) -> dict:
      """Parse a raw block of request text (one URL line plus header lines).

      Each non-empty line is classified as:

      * a URL, when it starts with ``http://`` or ``https://`` (the last such
        line wins);
      * an HTTP/2 pseudo-header, when it starts with ``:``.  Only
        ``:authority`` is kept, stored as ``Host``; the others are dropped;
      * a regular ``Name: value`` header, when it contains a colon.

      Anything else is ignored.

      Args:
          text: The raw text, one item per line.

      Returns:
          A dict with the keys ``"url"`` (str, or ``None`` when no URL line
          was found) and ``"headers"`` (dict of header name to value).
      """
      headers = {}
      url = None
      for raw_line in text.strip().splitlines():
          line = raw_line.strip()
          if not line:
              continue

          if line.startswith(("http://", "https://")):
              url = line
              continue

          if line.startswith(":"):
              # Pseudo-header.  Splitting the whole line on the first colon
              # would yield an empty name, so split after the leading colon.
              name, sep, value = line[1:].partition(":")
              if sep and name.strip().lower() == "authority":
                  headers["Host"] = value.strip()
              continue

          if ":" in line:
              name, value = line.split(":", 1)
              headers[name.strip()] = value.strip()
      return {"url": url, "headers": headers}
  109. # ============================================================
  110. # DASH Manifest 解析
  111. # ============================================================
  # XML namespace prefixes used in the ElementTree queries of parse_manifest().
  NS = {
      "mpd": "urn:mpeg:DASH:schema:MPD:2011",
      "sea": "urn:mpeg:dash:schema:sea:2012",
  }


  def parse_manifest(xml_text: str) -> dict:
      """Parse a DASH MPD manifest into a plain dictionary.

      Only the first ``Period`` is read.  Within it, every ``AdaptationSet``
      becomes one track.

      Args:
          xml_text: The manifest XML.

      Returns:
          A dict with the keys:

          * ``"duration_str"``: the raw ``mediaPresentationDuration`` value.
          * ``"duration_sec"``: that duration in seconds (0 if unparsable).
          * ``"base_url"``: stripped text of the top-level ``BaseURL``, or "".
          * ``"tracks"``: list of track dicts with ``id``, ``type``, ``mime``,
            ``codecs``, ``label``, ``width``, ``height``, ``key_url``, ``iv``,
            ``init_tpl``, ``media_tpl``, ``timescale``, ``segments`` (start
            time of each media segment, in timescale units) and
            ``representations`` (``id``, ``bandwidth``, ``width``, ``height``,
            ``codecs``).

      Raises:
          xml.etree.ElementTree.ParseError: If *xml_text* is not valid XML.
      """
      root = ET.fromstring(xml_text)
      info = {
          "duration_str": root.get("mediaPresentationDuration", ""),
          "duration_sec": 0,
          "base_url": "",
          "tracks": [],
      }

      # Duration looks like PT0H0M2555.584S; every component is optional.
      dur_m = re.match(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:([\d.]+)S)?", info["duration_str"])
      if dur_m:
          hours = int(dur_m.group(1) or 0)
          minutes = int(dur_m.group(2) or 0)
          seconds = float(dur_m.group(3) or 0)
          info["duration_sec"] = hours * 3600 + minutes * 60 + seconds

      base_el = root.find("mpd:BaseURL", NS)
      if base_el is not None and base_el.text:
          info["base_url"] = base_el.text.strip()

      period = root.find("mpd:Period", NS)
      if period is None:
          return info

      for adapt in period.findall("mpd:AdaptationSet", NS):
          track = {
              "id": adapt.get("id"),
              "type": adapt.get("contentType"),  # audio / video
              "mime": adapt.get("mimeType"),
              "codecs": adapt.get("codecs"),
              "label": "",
              "width": adapt.get("maxWidth"),
              "height": adapt.get("maxHeight"),
              "key_url": "",
              "iv": "",
              "init_tpl": "",
              "media_tpl": "",
              "timescale": 0,
              "segments": [],  # start time of every media segment
              "representations": [],
          }

          label_el = adapt.find("mpd:Label", NS)
          if label_el is not None and label_el.text:
              track["label"] = label_el.text

          # Encryption parameters.  ElementTree has already decoded XML
          # entities once, so the extra "&amp;" replacement below only changes
          # values that were escaped twice in the manifest.
          cp = adapt.find("mpd:ContentProtection", NS)
          if cp is not None:
              crypto = cp.find("sea:CryptoPeriod", NS)
              if crypto is not None:
                  track["key_url"] = crypto.get("keyUriTemplate", "").replace("&amp;", "&")
                  track["iv"] = crypto.get("IV", "")

          seg_tpl = adapt.find("mpd:SegmentTemplate", NS)
          if seg_tpl is not None:
              track["init_tpl"] = seg_tpl.get("initialization", "").replace("&amp;", "&")
              track["media_tpl"] = seg_tpl.get("media", "").replace("&amp;", "&")
              track["timescale"] = int(seg_tpl.get("timescale", "1"))
              timeline = seg_tpl.find("mpd:SegmentTimeline", NS)
              if timeline is not None:
                  cursor = 0
                  for s_el in timeline.findall("mpd:S", NS):
                      # An explicit @t restarts the running clock; without it
                      # the segment begins where the previous one ended.
                      explicit_start = s_el.get("t")
                      if explicit_start is not None:
                          cursor = int(explicit_start)
                      duration = int(s_el.get("d", 0))
                      repeat = int(s_el.get("r", 0))
                      # NOTE(review): a negative @r ("repeat until the end of
                      # the period") yields no segments here.
                      for _ in range(repeat + 1):
                          track["segments"].append(cursor)
                          cursor += duration

          for rep in adapt.findall("mpd:Representation", NS):
              track["representations"].append({
                  "id": rep.get("id"),
                  "bandwidth": int(rep.get("bandwidth", 0)),
                  "width": rep.get("width"),
                  "height": rep.get("height"),
                  "codecs": rep.get("codecs"),
              })
          info["tracks"].append(track)
      return info
  190. def resolve_url(base_url: str, template: str, rep_id: str, seg_time: int = None) -> str:
  191. """替换 DASH 模板中的占位符"""
  192. url = template
  193. # SharePoint 的模板占位符格式比较特殊,包含 GUID 后缀
  194. # $RepresentationIDe963038e-16ab-4be8-8c6d-17d09a407520amp;
  195. # $Timee963038e-16ab-4be8-8c6d-17d09a407520amp;
  196. url = re.sub(r'\$RepresentationID[^&]*amp;', rep_id, url)
  197. url = re.sub(r'\$RepresentationID\$', rep_id, url)
  198. if seg_time is not None:
  199. url = re.sub(r'\$Time[^&]*amp;', str(seg_time), url)
  200. url = re.sub(r'\$Time\$', str(seg_time), url)
  201. full = base_url + url
  202. return full
  203. # ============================================================
  204. # 下载器
  205. # ============================================================
  class SPVideoDownloader:
      """Download a SharePoint Stream video described by a DASH manifest.

      ``run()`` drives the whole flow: fetch and parse the manifest, fetch the
      AES key, download (and decrypt) the video and audio tracks, merge them
      with ffmpeg and finally fetch the transcript.  Progress and errors are
      reported with ``print``; fatal download errors end the process through
      ``sys.exit(1)``.
      """

      # Cookie domain used when the manifest URL carries no usable ``docid``.
      DEFAULT_COOKIE_DOMAIN = "ecvcorp-my.sharepoint.com"
      # User-Agent sent when the captured request headers contain none.
      DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
      # When the unit of transcript times is auto-detected, a transcript whose
      # largest start value is below this is taken to be in seconds.
      _SECONDS_THRESHOLD = 100000

      def __init__(self, manifest_url: str, headers: dict, output_dir: str = "output",
                   cookies: str = None):
          """Create a downloader and its HTTP session.

          Args:
              manifest_url: URL of the ``videomanifest`` request.
              headers: Request headers captured from the browser.  The PAC
                  token (``x-spopactoken``), ``Origin``, ``Referer`` and
                  ``User-Agent`` are read from it, case-insensitively.
              output_dir: Directory that receives all output files.
              cookies: Optional cookie header string or path to a JSON cookie
                  export; see ``_set_cookies``.
          """
          self.manifest_url = manifest_url
          self.headers = headers
          self.output_dir = output_dir
          self.session = requests.Session()
          self.session.headers.update({
              "User-Agent": self._find_header(headers, "User-Agent", self.DEFAULT_USER_AGENT),
          })
          # Headers that later requests have to replay.
          self.pac_token = self._find_header(headers, "x-spopactoken")
          self.origin = self._find_header(headers, "Origin", "")
          self.referer = self._find_header(headers, "Referer", "")
          self.manifest_info = None
          self.encryption_key = None
          self.encryption_iv = None
          self.sp_access_token = None
          # Per the original author, SharePoint authenticates with the
          # HttpOnly FedAuth / rtFa cookies.
          if cookies:
              self._set_cookies(cookies)

      @staticmethod
      def _find_header(headers: dict, name: str, default=None):
          """Return the value of header *name*, matching case-insensitively."""
          wanted = name.lower()
          for key, value in headers.items():
              if key.lower() == wanted:
                  return value
          return default

      def _set_cookies(self, cookie_str: str):
          """Load cookies into the session.

          Accepted inputs:

          1. Path to a JSON file: either a list of objects with ``name`` /
             ``value`` / optional ``domain`` (the layout the original author
             attributes to Cookie-Editor / EditThisCookie exports), or a
             plain ``{name: value}`` object.
          2. A Cookie header string, ``"name1=value1; name2=value2"``.

          Cookies without an explicit domain are bound to the host found in
          the manifest URL's ``docid`` parameter, falling back to
          ``DEFAULT_COOKIE_DOMAIN``.
          """
          cookie_str = cookie_str.strip()

          # The manifest is served from another host than the segments, so the
          # cookie domain is taken from the URL inside the ``docid`` parameter.
          sp_domain = ""
          docid = parse_qs(urlparse(self.manifest_url).query).get('docid', [''])[0]
          if docid:
              sp_domain = urlparse(unquote(docid)).hostname or ""
          if not sp_domain:
              sp_domain = self.DEFAULT_COOKIE_DOMAIN
          print(f" Cookie domain: {sp_domain}")

          if os.path.isfile(cookie_str):
              with open(cookie_str, 'r', encoding='utf-8') as f:
                  data = json.load(f)
              if isinstance(data, list):
                  loaded = 0
                  for c in data:
                      if not isinstance(c, dict):
                          continue
                      name = c.get('name', '')
                      value = c.get('value', '')
                      domain = c.get('domain') or sp_domain
                      if domain.startswith('.'):
                          domain = domain[1:]
                      if name and value:
                          self.session.cookies.set(name, value, domain=domain)
                          loaded += 1
                  # Report what was actually stored, not the raw list length.
                  print(f" 从 JSON 加载了 {loaded} 个 cookies")
              elif isinstance(data, dict):
                  for name, value in data.items():
                      self.session.cookies.set(name, str(value), domain=sp_domain)
                  print(f" 从 JSON 加载了 {len(data)} 个 cookies")
          else:
              # Cookie header form.  Values may contain "=" (base64), so the
              # string is split on "; " only and each pair on its first "=".
              for part in cookie_str.split('; '):
                  part = part.strip()
                  if '=' in part:
                      name, value = part.split('=', 1)
                      self.session.cookies.set(name.strip(), value.strip(), domain=sp_domain)

          # Diagnostics.
          cookie_names = [c.name for c in self.session.cookies]
          print(f" 已设置 {len(cookie_names)} 个 cookies: {', '.join(cookie_names)}")
          if 'FedAuth' not in cookie_names:
              print(" [警告] 缺少 FedAuth cookie - 这是 SharePoint 的主要认证 cookie")
          if 'rtFa' not in cookie_names:
              print(" [警告] 缺少 rtFa cookie")

      def _svc_headers(self) -> dict:
          """Return the headers for manifest and key requests (PAC token included)."""
          h = {}
          if self.pac_token:
              h["x-spopactoken"] = self.pac_token
          if self.origin:
              h["Origin"] = self.origin
          if self.referer:
              h["Referer"] = self.referer
          return h

      def _sp_headers(self) -> dict:
          """Return the headers for segment and transcript requests.

          Only ``Origin`` and ``Referer`` are replayed; no token is added.
          """
          h = {}
          if self.origin:
              h["Origin"] = self.origin
          if self.referer:
              h["Referer"] = self.referer
          return h

      def fetch_manifest(self, quiet: bool = False) -> dict:
          """Download and parse the DASH manifest, and save it to disk.

          The raw XML is written to ``<output_dir>/manifest.mpd``.  A 401
          response prints troubleshooting hints and exits the process.

          Args:
              quiet: Suppress the progress and track summary output.

          Returns:
              The parsed manifest (also stored in ``self.manifest_info``).

          Raises:
              requests.HTTPError: For error responses other than 401.
          """
          if not quiet:
              print("[1/5] 获取 DASH manifest...")
          resp = self.session.get(self.manifest_url, headers=self._svc_headers())
          if resp.status_code == 401:
              print(f"\n[错误] 401 Unauthorized - 认证失败")
              print(" 可能原因:")
              print(" 1. 复制了 OPTIONS preflight 请求 (没有 x-spopactoken)")
              print(" 2. Token 已过期 (需要重新从浏览器复制)")
              print(" 3. 没有访问权限")
              print(f"\n 当前 PAC Token: {'有' if self.pac_token else '无'}")
              if self.pac_token:
                  print(f" Token 前缀: {self.pac_token[:40]}...")
              sys.exit(1)
          resp.raise_for_status()

          xml_text = resp.text
          self.manifest_info = parse_manifest(xml_text)
          if not quiet:
              dur = self.manifest_info["duration_sec"]
              h, m, s = int(dur // 3600), int((dur % 3600) // 60), dur % 60
              print(f" 时长: {h:02d}:{m:02d}:{s:05.2f}")
              print(f" BaseURL: {self.manifest_info['base_url'][:80]}...")
              for track in self.manifest_info["tracks"]:
                  t = track["type"]
                  label = f" ({track['label']})" if track["label"] else ""
                  segs = len(track["segments"])
                  res = f" {track['width']}x{track['height']}" if track["width"] else ""
                  reps = ", ".join(str(r["id"]) for r in track["representations"])
                  print(f" Track {track['id']}: {t}{label}{res} | {segs} segments | reps: {reps}")

          # Keep a copy of the manifest next to the downloads.
          os.makedirs(self.output_dir, exist_ok=True)
          with open(os.path.join(self.output_dir, "manifest.mpd"), "w", encoding="utf-8") as f:
              f.write(xml_text)
          return self.manifest_info

      def _renew_manifest(self, track_type: str) -> dict:
          """Re-fetch the manifest and return the track of the given type.

          For audio the track labelled ``OriginalAudio`` is preferred; when no
          preferred track exists the first track of that type is returned.

          Returns:
              The matching track dict, or ``None`` when there is none.
          """
          print(" [续签] 重新获取 manifest...")
          self.fetch_manifest(quiet=True)
          candidates = [t for t in self.manifest_info["tracks"] if t["type"] == track_type]
          for track in candidates:
              if track_type == "audio" and track["label"] != "OriginalAudio":
                  continue
              return track
          return candidates[0] if candidates else None

      def fetch_encryption_key(self, track: dict) -> bytes:
          """Fetch the AES key of *track* and remember it together with the IV.

          Args:
              track: A track dict from the parsed manifest.

          Returns:
              The raw key bytes, or ``None`` when the track declares no key URL
              (the stored key and IV are left untouched in that case).

          Raises:
              requests.HTTPError: If the key request fails.
              ValueError: If the track's IV is not valid hexadecimal.
          """
          if not track["key_url"]:
              print(" [跳过] 无加密")
              return None
          print("[2/5] 获取解密密钥...")
          resp = self.session.get(track["key_url"], headers=self._svc_headers())
          resp.raise_for_status()
          key = resp.content
          print(f" Key: {key.hex()} ({len(key)} bytes)")

          # The IV is a hex string with an optional 0x / 0X prefix.
          iv_str = track["iv"]
          if iv_str[:2].lower() == "0x":
              iv_str = iv_str[2:]
          self.encryption_iv = bytes.fromhex(iv_str)
          self.encryption_key = key
          print(f" IV: {self.encryption_iv.hex()}")
          return key

      def decrypt_segment(self, data: bytes) -> bytes:
          """Decrypt one segment with AES-CBC and strip its PKCS#7 padding.

          The data is returned unchanged when no key has been fetched or when
          pycryptodome is not installed.  Padding is removed only when the
          trailing bytes form a valid PKCS#7 pad.
          """
          if not self.encryption_key or AES is None:
              return data
          cipher = AES.new(self.encryption_key, AES.MODE_CBC, self.encryption_iv)
          decrypted = cipher.decrypt(data)
          if decrypted:
              pad = decrypted[-1]
              if 0 < pad <= 16 and decrypted[-pad:] == bytes([pad]) * pad:
                  decrypted = decrypted[:-pad]
          return decrypted

      @staticmethod
      def _retry_after_seconds(value, default: int = 5) -> int:
          """Interpret a ``Retry-After`` header value as whole seconds.

          The header may also hold an HTTP date, which ``int()`` cannot parse;
          in that case, or when the value is missing or negative, *default* is
          returned.
          """
          try:
              seconds = int(value)
          except (TypeError, ValueError):
              return default
          return seconds if seconds >= 0 else default

      def download_track(self, track: dict, track_name: str, max_workers: int = 4) -> str:
          """Download every segment of one track and write them to one file.

          The init segment is fetched first, then all media segments; each is
          passed through ``decrypt_segment``.  With cookies in the session the
          segments are fetched concurrently, otherwise one at a time (the
          original author notes that without cookies requests start failing
          with 401 after roughly 50-70 segments).  All segments are held in
          memory until the file is written.

          Args:
              track: A track dict from the parsed manifest; its first
                  representation is the one downloaded.
              track_name: Output file stem (``<output_dir>/<track_name>.mp4``).
              max_workers: Thread count used when cookies are available.

          Returns:
              Path of the written file.  The process exits instead when the
              init segment returns 401, when a segment returns 401 without
              cookies, or after five failed segments.
          """
          import time as _time
          from concurrent.futures import ThreadPoolExecutor, as_completed

          base_url = self.manifest_info["base_url"]
          rep_id = track["representations"][0]["id"]
          segments = track["segments"]
          total = len(segments)
          has_cookies = len(self.session.cookies) > 0
          out_path = os.path.join(self.output_dir, f"{track_name}.mp4")

          # --- init segment ---
          init_url = resolve_url(base_url, track["init_tpl"], rep_id)
          seg_host = urlparse(init_url).hostname
          print(f" 下载 init segment...")
          # Debug aid: would the cookie jar send anything to this host?
          if has_cookies:
              matched = self.session.cookies.get_dict(domain=seg_host)
              print(f" [调试] 匹配到 {len(matched)} 个 cookies for {seg_host}")
              if not matched:
                  print(f" [调试] Cookie jar 内容:")
                  for c in self.session.cookies:
                      print(f" {c.name} domain={c.domain} path={c.path}")
          resp = self.session.get(init_url, headers=self._sp_headers())
          # Debug aid: what Cookie header actually went out?
          if has_cookies:
              sent_cookie = resp.request.headers.get('Cookie', '')
              if sent_cookie:
                  print(f" [调试] 实际发送 Cookie: {sent_cookie[:80]}...")
              else:
                  print(f" [调试] 未发送任何 Cookie!")
          if resp.status_code == 401:
              print(f" init segment 401")
              if not has_cookies:
                  print(" [提示] 缺少 FedAuth cookie,请用 -c 参数提供")
                  print(" 在浏览器 Console 执行: copy(document.cookie)")
                  print(" 然后: python sp_video_dl.py curl_command.txt -c \"粘贴的cookie\"")
              sys.exit(1)
          resp.raise_for_status()
          init_data = self.decrypt_segment(resp.content)

          # --- media segments ---
          print(f" 下载 {total} 个 media segments...")
          if has_cookies:
              workers = max_workers
              print(f" 模式: 并发 (workers={workers})")
          else:
              workers = 1
              print(f" 模式: 串行 (无 cookie,可能在 ~50 个后 401)")

          seg_data_map = {}

          def dl_one(idx, seg_time):
              """Fetch and decrypt one segment: three retried tries, then a final one."""
              url = resolve_url(base_url, track["media_tpl"], rep_id, seg_time)
              for _ in range(3):
                  r = self.session.get(url, headers=self._sp_headers())
                  if r.status_code == 200:
                      return idx, self.decrypt_segment(r.content)
                  if r.status_code in (429, 503):
                      _time.sleep(self._retry_after_seconds(r.headers.get("Retry-After")))
                  elif r.status_code == 401:
                      _time.sleep(2)
                  else:
                      r.raise_for_status()
              # Final try: any error status now propagates to the caller.
              r = self.session.get(url, headers=self._sp_headers())
              r.raise_for_status()
              return idx, self.decrypt_segment(r.content)

          downloaded = 0
          failed = 0
          with ThreadPoolExecutor(max_workers=workers) as pool:
              futures = {pool.submit(dl_one, i, t): i for i, t in enumerate(segments)}
              for future in as_completed(futures):
                  try:
                      idx, data = future.result()
                  except Exception as e:
                      failed += 1
                      err = str(e)
                      if "401" in err and not has_cookies:
                          # Nothing more will succeed: drop the queued work.
                          for f in futures:
                              f.cancel()
                          cookie_host = seg_host or self.DEFAULT_COOKIE_DOMAIN
                          print(f"\n 已下载 {downloaded}/{total} 后遇到 401")
                          print(f" 根本原因: 缺少 SharePoint 认证 cookie (FedAuth/rtFa)")
                          print(f" HAR 导出不包含 HttpOnly cookie,需要手动获取")
                          print(f"\n 解决方案:")
                          print(f" 1. 在浏览器 Console 执行: copy(document.cookie)")
                          print(f" 2. 运行: python sp_video_dl.py curl_command.txt -c \"粘贴\"")
                          print(f"\n 如果 document.cookie 为空 (HttpOnly),安装浏览器扩展:")
                          print(f" - EditThisCookie 或 Cookie-Editor")
                          print(f" - 导出 {cookie_host} 的所有 cookie 为 JSON")
                          print(f" - 运行: python sp_video_dl.py curl_command.txt -c cookies.json")
                          sys.exit(1)
                      if failed >= 5:
                          for f in futures:
                              f.cancel()
                          print(f"\n[错误] 失败 {failed} 次: {err[:200]}")
                          sys.exit(1)
                  else:
                      seg_data_map[idx] = data
                      downloaded += 1
                      if downloaded % 50 == 0 or downloaded == total:
                          pct = downloaded / total * 100
                          print(f" 进度: {downloaded}/{total} ({pct:.1f}%)")

          # Up to four segments may have failed without aborting the run; say
          # so instead of silently writing a file with holes in it.
          missing = [i for i in range(total) if i not in seg_data_map]
          if missing:
              print(f" [警告] 有 {len(missing)} 个 segment 下载失败,输出文件将不完整 (序号: {missing[:10]})")

          # --- write init + segments in timeline order ---
          print(f" 写入 {out_path}...")
          with open(out_path, "wb") as f:
              f.write(init_data)
              for i in range(total):
                  if i in seg_data_map:
                      f.write(seg_data_map[i])
          size_mb = os.path.getsize(out_path) / 1024 / 1024
          print(f" 完成: {size_mb:.1f} MB")
          return out_path

      def select_tracks(self) -> tuple:
          """Pick the tracks to download.

          Returns:
              ``(video_track, audio_track)``: the first video track, and the
              audio track labelled ``OriginalAudio`` if there is one, otherwise
              the first audio track.  Either element may be ``None``.
          """
          video_track = None
          audio_track = None
          for track in self.manifest_info["tracks"]:
              if track["type"] == "video":
                  if video_track is None:
                      video_track = track
              elif track["type"] == "audio":
                  if track["label"] == "OriginalAudio" or audio_track is None:
                      audio_track = track
          return video_track, audio_track

      def fetch_transcript(self) -> str:
          """Download the transcript and convert it to SRT and plain text.

          Steps:

          1. Build the item API URL from the manifest URL's ``docid``.
          2. Request the transcript metadata and pick the default transcript
             (or the first one).
          3. Download its content.
          4. WebVTT content is saved as ``transcript.vtt``; anything else is
             saved as ``transcript.json`` and parsed as JSON.  Parsed entries
             are written to ``transcript.srt`` and ``transcript.txt``.

          Returns:
              Path of the SRT file when entries were converted, otherwise the
              path of the raw file, or ``None`` when no transcript could be
              downloaded.
          """
          print("\n[额外] 下载转录...")
          qs = parse_qs(urlparse(self.manifest_url).query)
          docid = unquote(qs.get('docid', [''])[0])
          if not docid:
              print(" [跳过] 无法从 manifest URL 提取 docid")
              return None
          docid_parsed = urlparse(docid)
          sp_host = f"{docid_parsed.scheme}://{docid_parsed.hostname}"
          path = docid_parsed.path.replace('/_api/v2.0/', '/_api/v2.1/')
          item_url = sp_host + path

          # Step 1: transcript metadata.
          meta_url = item_url + "?select=media%2Ftranscripts%2CaudioTracks&%24expand=media%2Ftranscripts%2Cmedia%2FaudioTracks"
          print(f" 获取转录元数据...")
          resp = self.session.get(meta_url, headers=self._sp_headers())
          if resp.status_code != 200:
              print(f" [跳过] 转录元数据请求返回 {resp.status_code}")
              return None
          try:
              meta = resp.json()
          except Exception:
              print(f" [跳过] 转录元数据解析失败")
              return None
          transcripts = meta.get('media', {}).get('transcripts', [])
          if not transcripts:
              print(f" [跳过] 该视频没有转录")
              return None
          print(f" 找到 {len(transcripts)} 个转录:")
          for t in transcripts:
              print(f" - {t.get('displayName', '?')} ({t.get('languageTag', '?')}, {t.get('size', 0)} bytes)")
          transcript = next((t for t in transcripts if t.get('isDefault')), transcripts[0])
          transcript_id = transcript.get('id', '')

          # Step 2: transcript content.
          download_url = transcript.get('temporaryDownloadUrl', '')
          if not download_url:
              download_url = f"{item_url}/versions/current/media/transcripts/{transcript_id}/content"
          print(f" 下载转录内容...")
          resp = self.session.get(download_url, headers=self._sp_headers())
          if resp.status_code != 200:
              print(f" [跳过] 转录下载返回 {resp.status_code}")
              return None
          resp.encoding = 'utf-8-sig'  # decode as UTF-8 and drop a leading BOM
          content = resp.text
          os.makedirs(self.output_dir, exist_ok=True)
          srt_path = os.path.join(self.output_dir, "transcript.srt")
          txt_path = os.path.join(self.output_dir, "transcript.txt")

          if content.lstrip('\ufeff').startswith('WEBVTT'):
              vtt_path = os.path.join(self.output_dir, "transcript.vtt")
              with open(vtt_path, 'w', encoding='utf-8') as f:
                  f.write(content)
              print(f" 已保存 VTT: {vtt_path}")
              entries = self._parse_vtt(content)
              if not entries:
                  return vtt_path
              # _parse_vtt always yields milliseconds, so no unit guessing.
              self._to_srt(entries, srt_path, times_in_ms=True)
              self._to_txt(entries, txt_path, times_in_ms=True)
              print(f" 已保存 SRT: {srt_path} ({len(entries)} 条)")
              print(f" 已保存纯文本: {txt_path}")
              return srt_path

          # Not WebVTT: keep the raw payload and try to read it as JSON.
          raw_path = os.path.join(self.output_dir, "transcript.json")
          with open(raw_path, 'w', encoding='utf-8') as f:
              f.write(content)
          print(f" 已保存原始转录: {raw_path}")
          try:
              data = json.loads(content)
              entries = data if isinstance(data, list) else data.get('entries', data.get('cues', []))
              if not entries:
                  # Fall back to the first non-empty list found in the payload.
                  for key in data:
                      if isinstance(data[key], list) and len(data[key]) > 0:
                          entries = data[key]
                          break
              if entries:
                  self._to_srt(entries, srt_path)
                  self._to_txt(entries, txt_path)
                  print(f" 已保存 SRT: {srt_path}")
                  print(f" 已保存纯文本: {txt_path}")
                  return srt_path
          except Exception as e:
              # Best effort: the raw file has been saved already.
              print(f" JSON 解析失败: {e}")
          return raw_path

      @staticmethod
      def _parse_vtt(content: str) -> list:
          """Parse WebVTT text into a list of cue dicts.

          Each cue becomes ``{'startTime', 'endTime', 'text', 'speakerName'}``
          with times in milliseconds.  Text lines are joined with a space and a
          leading ``<v Name>`` voice tag is moved into ``speakerName``.  Blocks
          without a parsable timing line are skipped.
          """
          # Timestamps are [hours:]MM:SS.mmm; the hours part is optional.
          stamp = r'(?:(\d+):)?(\d{2}):(\d{2})\.(\d{3})'
          timing_re = re.compile(stamp + r'\s*-->\s*' + stamp)

          def to_ms(hours, minutes, seconds, millis):
              return (int(hours or 0) * 3600000 + int(minutes) * 60000
                      + int(seconds) * 1000 + int(millis))

          # Drop the BOM and normalize CRLF / CR so that cues can be separated
          # on blank lines whatever line ending the server used.
          content = content.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')

          entries = []
          for block in re.split(r'\n\n+', content):
              lines = block.strip().split('\n')
              if len(lines) < 2:
                  continue
              # The timing line is the first one containing "-->".
              timing_idx = next((j for j, line in enumerate(lines) if '-->' in line), None)
              if timing_idx is None:
                  continue
              m = timing_re.match(lines[timing_idx].strip())
              if not m:
                  continue

              text = ' '.join(l.strip() for l in lines[timing_idx + 1:] if l.strip())
              # Speaker is given as <v Speaker Name>text</v>, or not at all.
              speaker = ''
              speaker_m = re.match(r'<v\s+([^>]+)>(.*?)(?:</v>)?$', text)
              if speaker_m:
                  speaker = speaker_m.group(1).strip()
                  text = speaker_m.group(2).strip()
              entries.append({
                  'startTime': to_ms(*m.group(1, 2, 3, 4)),
                  'endTime': to_ms(*m.group(5, 6, 7, 8)),
                  'text': text,
                  'speakerName': speaker,
              })
          return entries

      @staticmethod
      def _ms_to_srt_time(ms: float) -> str:
          """Format milliseconds as an SRT timestamp, ``HH:MM:SS,mmm``."""
          ms = int(ms)
          hours, rest = divmod(ms, 3600000)
          minutes, rest = divmod(rest, 60000)
          seconds, millis = divmod(rest, 1000)
          return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"

      @staticmethod
      def _entry_start(entry: dict):
          """Return an entry's start time under any of the supported keys."""
          return entry.get('startTime', entry.get('start', entry.get('offset', 0)))

      @classmethod
      def _time_scale(cls, entries: list, times_in_ms: bool = None) -> int:
          """Return the factor that converts the entries' times to milliseconds.

          With ``times_in_ms`` given the answer is fixed (1 or 1000).  With
          ``None`` the unit is guessed once for the whole list: if the largest
          numeric start value is below ``_SECONDS_THRESHOLD`` the times are
          taken to be seconds.  Deciding per list rather than per entry keeps
          the early cues of a millisecond transcript from being scaled.
          """
          if times_in_ms is not None:
              return 1 if times_in_ms else 1000
          starts = [s for s in map(cls._entry_start, entries) if isinstance(s, (int, float))]
          if starts and max(starts) < cls._SECONDS_THRESHOLD:
              return 1000
          return 1

      def _to_srt(self, entries: list, path: str, times_in_ms: bool = None):
          """Write *entries* to *path* as SRT subtitles.

          Args:
              entries: Dicts with a start (``startTime`` / ``start`` /
                  ``offset``), an end (``endTime`` / ``end``, or start plus
                  ``duration``), a text (``text`` / ``value`` / ``content``)
                  and optionally a speaker (``speakerName`` / ``speaker`` /
                  ``displayName``).
              path: Output file path.
              times_in_ms: ``True`` if times are milliseconds, ``False`` if
                  seconds, ``None`` to guess (see ``_time_scale``).
          """
          scale = self._time_scale(entries, times_in_ms)
          with open(path, 'w', encoding='utf-8') as f:
              for i, entry in enumerate(entries, 1):
                  start = self._entry_start(entry)
                  if 'endTime' in entry:
                      end = entry['endTime']
                  elif 'end' in entry:
                      end = entry['end']
                  else:
                      end = start + entry.get('duration', 0)
                  text = entry.get('text', entry.get('value', entry.get('content', '')))
                  speaker = entry.get('speakerName', entry.get('speaker', entry.get('displayName', '')))
                  if scale != 1 and isinstance(start, (int, float)):
                      start *= scale
                      if isinstance(end, (int, float)):
                          end *= scale
                  start_str = self._ms_to_srt_time(start)
                  end_str = self._ms_to_srt_time(end)
                  line = f"[{speaker}] {text}" if speaker else text
                  f.write(f"{i}\n{start_str} --> {end_str}\n{line}\n\n")

      def _to_txt(self, entries: list, path: str, times_in_ms: bool = None):
          """Write *entries* to *path* as plain text.

          A ``[speaker] (MM:SS)`` heading is written whenever the speaker
          changes; entries without a speaker only contribute their text line.
          ``times_in_ms`` has the same meaning as in ``_to_srt``.
          """
          scale = self._time_scale(entries, times_in_ms)
          with open(path, 'w', encoding='utf-8') as f:
              last_speaker = ""
              for entry in entries:
                  start = self._entry_start(entry)
                  text = entry.get('text', entry.get('value', entry.get('content', '')))
                  speaker = entry.get('speakerName', entry.get('speaker', entry.get('displayName', '')))
                  if scale != 1 and isinstance(start, (int, float)):
                      start *= scale
                  mins = int(start / 1000 / 60)
                  secs = int(start / 1000) % 60
                  if speaker and speaker != last_speaker:
                      f.write(f"\n[{speaker}] ({mins:02d}:{secs:02d})\n")
                      last_speaker = speaker
                  f.write(f"{text}\n")

      def run(self, max_workers: int = 4):
          """Run the complete download.

          Fetches the manifest, the key and both tracks, merges them into
          ``<output_dir>/final.mp4`` with ffmpeg (stream copy) and downloads
          the transcript.  Without an audio track the video file simply
          becomes ``final.mp4``.

          Args:
              max_workers: Thread count passed to ``download_track``.
          """
          import subprocess

          print("=" * 60)
          print("SharePoint Stream 视频下载器")
          print("=" * 60)

          # Step 1: manifest
          self.fetch_manifest()
          video_track, audio_track = self.select_tracks()
          if not video_track:
              print("[错误] 未找到视频 track")
              return

          # Step 2: decryption key
          self.fetch_encryption_key(video_track)

          # Step 3: video
          print(f"\n[3/5] 下载视频 track...")
          video_path = self.download_track(video_track, "video", max_workers)

          # Step 4: audio
          audio_path = None
          if audio_track:
              # Switch keys only when the audio track declares its own,
              # different key or IV; an audio track without a declaration keeps
              # using the video key, as before.
              audio_crypto = (audio_track["key_url"], audio_track["iv"])
              if audio_track["key_url"] and audio_crypto != (video_track["key_url"], video_track["iv"]):
                  self.fetch_encryption_key(audio_track)
              print(f"\n[4/5] 下载音频 track...")
              audio_path = self.download_track(audio_track, "audio", max_workers)

          # Step 5: merge
          print(f"\n[5/5] 合并音视频...")
          final_path = os.path.join(self.output_dir, "final.mp4")
          if audio_path:
              ffmpeg_cmd = f'ffmpeg -y -i "{video_path}" -i "{audio_path}" -c copy "{final_path}"'
              print(f" 执行: {ffmpeg_cmd}")
              # Run with an argument list rather than a shell string, so that
              # paths containing quotes or shell metacharacters are passed
              # through verbatim.
              try:
                  ret = subprocess.run(
                      ["ffmpeg", "-y", "-i", video_path, "-i", audio_path, "-c", "copy", final_path]
                  ).returncode
              except FileNotFoundError:
                  ret = None  # ffmpeg is not installed / not on PATH
              if ret == 0:
                  print(f" 完成: {final_path}")
                  size_mb = os.path.getsize(final_path) / 1024 / 1024
                  print(f" 大小: {size_mb:.1f} MB")
              else:
                  if ret is None:
                      print(" 未找到 ffmpeg,请先安装并加入 PATH")
                  else:
                      print(f" ffmpeg 失败 (返回码 {ret})")
                  print(f" 视频和音频已分别保存,请手动合并:")
                  print(f" {video_path}")
                  print(f" {audio_path}")
          else:
              # os.replace also overwrites an existing final.mp4 on Windows,
              # where os.rename would fail.
              os.replace(video_path, final_path)
              print(f" 完成: {final_path}")

          # Step 6: transcript
          self.fetch_transcript()
          print("\n" + "=" * 60)
          print("下载完成!")
          print("=" * 60)
  751. # ============================================================
  752. # 入口
  753. # ============================================================
  754. def main():
  755. parser = argparse.ArgumentParser(
  756. description="SharePoint Stream 视频下载器",
  757. epilog="用法: 从浏览器 DevTools 复制 videomanifest 请求的 cURL 命令"
  758. )
  759. parser.add_argument("input", nargs="?", help="包含 cURL 命令的文件路径")
  760. parser.add_argument("-o", "--output", default="output", help="输出目录 (默认: output)")
  761. parser.add_argument("-w", "--workers", type=int, default=4, help="并发下载线程数 (默认: 4)")
  762. parser.add_argument("-t", "--token", help="driveAccessToken (如果 segment 下载 401,从浏览器 Console 获取)")
  763. parser.add_argument("-c", "--cookie", help="Cookie 字符串或 JSON 文件路径 (从浏览器获取 FedAuth/rtFa)")
  764. parser.add_argument("--dry-run", action="store_true", help="仅解析 manifest,不下载")
  765. parser.add_argument("--transcript-only", action="store_true", help="仅下载转录文件")
  766. args = parser.parse_args()
  767. # 获取 cURL 输入
  768. curl_text = None
  769. if args.input:
  770. with open(args.input, "r", encoding="utf-8") as f:
  771. curl_text = f.read()
  772. else:
  773. print("请粘贴 videomanifest 请求的 cURL 命令 (粘贴完成后按两次回车):")
  774. print("-" * 40)
  775. lines = []
  776. empty_count = 0
  777. while True:
  778. try:
  779. line = input()
  780. if line.strip() == "":
  781. empty_count += 1
  782. if empty_count >= 2:
  783. break
  784. lines.append(line)
  785. else:
  786. empty_count = 0
  787. lines.append(line)
  788. except EOFError:
  789. break
  790. curl_text = "\n".join(lines)
  791. if not curl_text or not curl_text.strip():
  792. print("[错误] 未提供输入")
  793. sys.exit(1)
  794. # 解析
  795. curl_text = curl_text.strip()
  796. if curl_text.lower().startswith("curl"):
  797. parsed = parse_curl(curl_text)
  798. elif curl_text.startswith("http"):
  799. parsed = parse_raw_headers(curl_text)
  800. else:
  801. # 尝试两种方式
  802. try:
  803. parsed = parse_curl(curl_text)
  804. except Exception:
  805. parsed = parse_raw_headers(curl_text)
  806. if not parsed.get("url"):
  807. print("[错误] 无法解析 URL")
  808. sys.exit(1)
  809. # ============================================================
  810. # 检测 OPTIONS preflight 请求 (最常见的错误)
  811. # ============================================================
  812. is_preflight = False
  813. method = parsed.get("method", "GET")
  814. has_acr_headers = any(
  815. k.lower() == "access-control-request-headers"
  816. for k in parsed["headers"]
  817. )
  818. has_pac_token = any(
  819. k.lower() == "x-spopactoken"
  820. for k in parsed["headers"]
  821. )
  822. if method == "OPTIONS" or has_acr_headers:
  823. is_preflight = True
  824. if is_preflight:
  825. print("\n" + "=" * 60)
  826. print("[错误] 你复制的是 OPTIONS preflight 请求,不是实际的 GET 请求!")
  827. print("=" * 60)
  828. print("""
  829. 这是浏览器 CORS 预检请求,里面没有认证 token,无法下载。
  830. 请按以下步骤重新复制:
  831. 1. 打开 DevTools -> Network 标签
  832. 2. 在 Filter 框输入 "videomanifest"
  833. 3. 你会看到两个同名请求:
  834. - 一个 Method 是 OPTIONS (预检) <- 不要复制这个
  835. - 一个 Method 是 GET (实际请求) <- 复制这个!
  836. 4. 右键点击 GET 那个请求 -> Copy -> Copy as cURL (bash)
  837. 如何区分:
  838. - OPTIONS 请求的 headers 里有 "access-control-request-headers"
  839. - GET 请求的 headers 里有 "x-spopactoken: v1.eyJ..." (一个很长的 token)
  840. """)
  841. sys.exit(1)
  842. if not has_pac_token:
  843. print("\n[警告] 未找到 x-spopactoken header,请求可能会 401")
  844. print(" 确保复制的是 GET 请求 (不是 OPTIONS preflight)")
  845. print(f"\n解析到 URL: {parsed['url'][:100]}...")
  846. print(f"Headers: {len(parsed['headers'])} 个")
  847. for k in sorted(parsed["headers"].keys()):
  848. v = parsed["headers"][k]
  849. if len(v) > 60:
  850. v = v[:60] + "..."
  851. print(f" {k}: {v}")
  852. if "videomanifest" not in parsed["url"]:
  853. print("\n[警告] URL 中不包含 'videomanifest',可能不是正确的请求")
  854. resp = input("是否继续? (y/N): ")
  855. if resp.lower() != "y":
  856. sys.exit(0)
  857. # 下载
  858. downloader = SPVideoDownloader(
  859. manifest_url=parsed["url"],
  860. headers=parsed["headers"],
  861. output_dir=args.output,
  862. cookies=args.cookie,
  863. )
  864. # 设置 driveAccessToken
  865. if args.token:
  866. token = args.token.strip()
  867. if token.startswith("access_token="):
  868. token = token[len("access_token="):]
  869. downloader.sp_access_token = token
  870. print(f"\n已设置 driveAccessToken: {token[:40]}...")
  871. if args.dry_run:
  872. downloader.fetch_manifest()
  873. print("\n[DRY RUN] 仅解析 manifest,不下载")
  874. elif args.transcript_only:
  875. downloader.fetch_transcript()
  876. else:
  877. downloader.run(max_workers=args.workers)
  878. if __name__ == "__main__":
  879. main()