# CrossDownload.py (5.0 KB)
import random
import re
import time
from pathlib import Path
from urllib.parse import urljoin

from faker import Faker
from playwright.sync_api import sync_playwright
  7. TARGET_TABLES = [
  8. "(1)2025年进出口商品总值表 A:年度表",
  9. "(1)2025年进出口商品总值表 B:月度表",
  10. "(2)2025年进出口商品国别(地区)总值表",
  11. "(4)2025年进出口商品类章总值表",
  12. "(8)2025年进出口商品收发货人所在地总值表",
  13. "(15)2025年对部分国家(地区)出口商品类章金额表",
  14. "(16)2025年自部分国家(地区)进口商品类章金额表"
  15. ]
def generate_dynamic_fingerprint(page):
    """Enhanced browser-fingerprint generation scheme.

    (Original note: combines the techniques from web references 1 and 8.)
    """
    # ...unchanged...
    # NOTE(review): the implementation is elided in this listing. As shown, the
    # body consists only of the docstring, so a call does nothing and returns
    # None; nothing else in the visible code calls this function.
  19. def process_table_row(row):
  20. """动态元素处理方案(网页4建议)"""
  21. # 使用locator替代静态查询
  22. cells = row.locator('td').all()
  23. if len(cells) < 2:
  24. return None
  25. try:
  26. table_name = cells[0].inner_text(timeout=5000).strip()
  27. month_links = [
  28. (int(a.inner_text().replace('月', '')), a.get_attribute('href'))
  29. for a in cells[1].locator('a.blue').all()
  30. if a.is_visible()
  31. ]
  32. month_links.sort(reverse=True, key=lambda x: x[0])
  33. return (table_name, month_links)
  34. except Exception as e:
  35. print(f"行处理异常: {str(e)}")
  36. return None
  37. def download_excel(page, table_name, month_data):
  38. """优化后的下载方法(整合网页6、7、8方案)"""
  39. max_month, max_link = month_data
  40. safe_name = re.sub(r'[\\/*?:"<>|]', "", table_name).replace(' ', '_')
  41. try:
  42. # 直接访问下载链接(网页6技术)
  43. with page.expect_download() as download_info:
  44. page.goto(f"http://www.customs.gov.cn{max_link}",
  45. wait_until="networkidle",
  46. timeout=60000)
  47. # 精准定位下载元素(适配新页面结构)
  48. download_btn = page.locator('span.easysite-isprase a[href$=".xls"], span.easysite-isprase a[href$=".xlsx"]')
  49. download_btn.click(timeout=15000)
  50. download = download_info.value
  51. file_ext = download.suggested_filename.split('.')[-1]
  52. file_name = f"{safe_name}_{max_month}月.{file_ext}"
  53. # 创建下载目录
  54. download_path = Path('../src/downloads') / f"{time.strftime('%Y%m%d')}"
  55. download_path.mkdir(parents=True, exist_ok=True)
  56. # 保存文件(网页8方案)
  57. final_path = download_path / file_name
  58. download.save_as(final_path)
  59. print(f"√ 成功下载: {file_name}")
  60. # 返回原始页面(关键修复点)
  61. page.go_back()
  62. page.wait_for_load_state('networkidle')
  63. return final_path
  64. except Exception as e:
  65. print(f"× 下载失败 {table_name}: {str(e)}")
  66. page.screenshot(path=f'error_{safe_name}.png')
  67. raise
  68. def crawl_with_fingerprint(url):
  69. with sync_playwright() as p:
  70. browser = p.firefox.launch(
  71. headless=True,
  72. args=[
  73. '--disable-blink-features=AutomationControlled',
  74. '--lang=zh-CN',
  75. '--window-size=1440,900'
  76. ]
  77. )
  78. context = browser.new_context(
  79. user_agent=Faker().firefox(),
  80. viewport={'width': 1440, 'height': 900},
  81. device_scale_factor=1,
  82. accept_downloads=True, # 关键参数(网页7建议)
  83. extra_http_headers={
  84. "Host": "www.customs.gov.cn",
  85. "Accept-Language": "zh-CN,zh;q=0.9"
  86. }
  87. )
  88. try:
  89. page = context.new_page()
  90. page.add_init_script("""
  91. Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
  92. window.alert = () => {};
  93. """)
  94. # 优化后的表格处理流程
  95. page.goto(url, wait_until="networkidle", timeout=60000)
  96. rows_locator = page.locator('#yb2025RMB tr')
  97. for i in range(1, rows_locator.count()):
  98. row = rows_locator.nth(i)
  99. if not row.is_visible():
  100. continue
  101. result = process_table_row(row)
  102. if not result: continue
  103. table_name, month_links = result
  104. if table_name not in TARGET_TABLES: continue
  105. if not month_links: continue
  106. try:
  107. download_excel(page, table_name, month_links[0])
  108. time.sleep(random.uniform(2, 5)) # 随机等待
  109. except Exception as e:
  110. print(f"表格处理中断: {str(e)}")
  111. break
  112. # 释放元素引用(关键修复点)
  113. row.evaluate('element => element.remove()')
  114. finally:
  115. context.close()
  116. browser.close()
  117. if __name__ == "__main__":
  118. Path('../src/downloads').mkdir(exist_ok=True)
  119. crawl_with_fingerprint("http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html")