CrossDownloadYear.py

from playwright.sync_api import sync_playwright
import re
import time
import random
from pathlib import Path
from faker import Faker

YEAR = 2023
TARGET_TABLES = [
    f"(2){YEAR}年进出口商品国别(地区)总值表",
    f"(4){YEAR}年进出口商品类章总值表",
    f"(8){YEAR}年进出口商品收发货人所在地总值表",
    f"(15){YEAR}年对部分国家(地区)出口商品类章金额表",
    f"(16){YEAR}年自部分国家(地区)进口商品类章金额表"
]
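# The crawler matches rows against TARGET_TABLES by exact string equality, so
# each entry above has to reproduce the table title shown on the listing page
# character for character, including the "(2)"-style numbering prefix.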
def process_table_row(row):
    """Parse one listing row: return the table name and its per-month download links."""
    try:
        cells = row.locator('td').all()
        if len(cells) < 2:
            return None
        table_name = cells[0].inner_text(timeout=8000).strip()
        month_links = [
            (int(a.inner_text().replace('月', '')), a.get_attribute('href'))
            for a in cells[1].locator('a').all()
            if a.is_visible() and a.get_attribute('href')
        ]
        # Sort months in ascending order (1-12)
        month_links.sort(key=lambda x: x[0])
        return (table_name, month_links)
    except Exception as e:
        print(f"Failed to parse table row: {e}")
        return None
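# Illustrative only: the row shape process_table_row() assumes looks roughly
# like the simplified (not verbatim) markup below — a title cell followed by a
# cell containing one link per month:
#
#   <tr>
#     <td>(2)2023年进出口商品国别(地区)总值表</td>
#     <td><a href="...">1月</a> <a href="...">2月</a> ...</td>
#   </tr>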
def download_monthly_data(page, table_name, month_data):
    """Download one month's data file for the given table."""
    month_num, link = month_data
    safe_name = re.sub(r'[\\/*?:"<>|]', "", table_name).replace(' ', '_')
    try:
        # Open the monthly detail page first, then trigger the file download
        page.goto(f"http://www.customs.gov.cn{link}",
                  wait_until="networkidle",
                  timeout=80000)
        # Generic download-link locator (covers both .xls and .xlsx attachments)
        download_btn = page.locator(
            'span.easysite-isprase a[href$=".xls"], span.easysite-isprase a[href$=".xlsx"]')
        with page.expect_download() as download_info:
            download_btn.click(timeout=15000)
        download = download_info.value
        file_ext = download.suggested_filename.split('.')[-1] if '.' in download.suggested_filename else 'xls'
        # Create the per-month directory
        download_dir = Path('../src/downloads') / f"{YEAR}/{month_num:02d}月"
        download_dir.mkdir(parents=True, exist_ok=True)
        # Save under a sanitised, consistent file name
        final_path = download_dir / f"{safe_name}.{file_ext}"
        download.save_as(final_path)
        print(f"√ Downloaded: {final_path}")
        # Return to the listing page and wait for it to settle
        page.go_back()
        page.wait_for_load_state('networkidle')
        return True
    except Exception as e:
        print(f"× Download failed for {table_name}, month {month_num}: {e}")
        page.screenshot(path=f'error_{safe_name}_{month_num:02d}.png')
        return False
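# With the path logic above, downloads end up laid out like this (relative to
# the process working directory; the extension follows the server-suggested
# filename and falls back to .xls):
#
#   ../src/downloads/2023/01月/(2)2023年进出口商品国别(地区)总值表.xls
#   ../src/downloads/2023/02月/...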
def crawl_with_fingerprint(url):
    with sync_playwright() as p:
        browser = p.firefox.launch(
            headless=True,
            # NOTE: these flags are Chromium-style; Firefox does not implement
            # --disable-blink-features, so they are best-effort only.
            args=[
                '--disable-blink-features=AutomationControlled',
                '--lang=zh-CN',
                '--window-size=1440,900'
            ]
        )
        context = browser.new_context(
            user_agent=Faker().firefox(),
            viewport={'width': 1440, 'height': 900},
            device_scale_factor=1,
            accept_downloads=True,
            extra_http_headers={
                "Host": "www.customs.gov.cn",
                "Accept-Language": "zh-CN,zh;q=0.9"
            }
        )
        try:
            page = context.new_page()
            # Hide the webdriver flag and silence alert dialogs
            page.add_init_script("""
                Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
                window.alert = () => {};
            """)
            # Open the yearly statistics listing page
            page.goto(url, wait_until="networkidle", timeout=60000)
            rows = page.locator(f'#yb{YEAR}RMB tr').all()[1:]  # skip the header row
            print(f"Found {len(rows)} table rows")
            # Parse every row before any navigation happens, so the row locators
            # still resolve against the original listing page when they are read.
            parsed_rows = [process_table_row(row) for row in rows]
            for result in parsed_rows:
                if not result:
                    continue
                table_name, month_links = result
                if table_name not in TARGET_TABLES:
                    continue
                print(f"\nProcessing table: {table_name}")
                # Walk through every available month
                for month_data in month_links:
                    month_num = month_data[0]
                    if 1 <= month_num <= 12:  # keep only valid month numbers
                        retry_count = 0
                        while retry_count < 2:  # retry once on failure
                            if download_monthly_data(page, table_name, month_data):
                                break
                            retry_count += 1
                            time.sleep(5)
                        # Random pause between downloads to reduce the crawl rate
                        time.sleep(random.uniform(3, 8))
        finally:
            context.close()
            browser.close()
if __name__ == "__main__":
    Path('../src/downloads').mkdir(parents=True, exist_ok=True)
    target_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/4899681/index.html"
    crawl_with_fingerprint(target_url)
    print("Full-year download run finished")
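# Usage sketch (assumes the dependencies and Playwright's Firefox build are
# installed; run from the script's own directory so the relative
# ../src/downloads path resolves as intended):
#   pip install playwright faker
#   playwright install firefox
#   python CrossDownloadYear.py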