selenium_download.py 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
  1. import os
  2. import random
  3. import re
  4. import time
  5. from pathlib import Path
  6. from faker import Faker
  7. from selenium import webdriver
  8. from selenium.common import StaleElementReferenceException
  9. from selenium.webdriver import FirefoxOptions, ActionChains
  10. from selenium.webdriver.support import expected_conditions as EC
  11. from selenium.webdriver.support.ui import WebDriverWait
  12. from utils.constants import DOWNLOAD_DIR
  13. from utils.download_utils import configure_stealth_options, wait_for_download, download_excel
  14. from selenium.webdriver.common.by import By
  15. from utils.log import log
  16. YEAR = 2025
  17. TARGET_TABLES = [
  18. f"(2){YEAR}年进出口商品国别(地区)总值表",
  19. f"(4){YEAR}年进出口商品类章总值表",
  20. f"(8){YEAR}年进出口商品收发货人所在地总值表",
  21. f"(15){YEAR}年对部分国家(地区)出口商品类章金额表",
  22. f"(16){YEAR}年自部分国家(地区)进口商品类章金额表"
  23. ]
  24. base_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html"
  25. download_dir = DOWNLOAD_DIR / "total"
  26. def process_table_row(row):
  27. """动态处理表格行数据(Selenium语法)"""
  28. try:
  29. # 获取所有表格单元格(td)元素
  30. cells = row.find_elements(By.TAG_NAME, 'td')
  31. if len(cells) < 2:
  32. return None
  33. # 获取表格名
  34. table_name = cells[0].text.strip()
  35. # 获取第二列中的所有链接,提取月份和href
  36. month_links = []
  37. links = cells[1].find_elements(By.TAG_NAME, 'a')
  38. for a in links:
  39. # 获取文本并去掉‘月’
  40. month_text = a.text
  41. if '月' in month_text:
  42. month = int(month_text.replace('月', '').strip())
  43. href = a.get_attribute('href')
  44. if href:
  45. month_links.append((month, href))
  46. # 按月份升序排列(1-12月)
  47. month_links.sort(key=lambda x: x[0], reverse=True)
  48. return (table_name, month_links)
  49. except Exception as e:
  50. log.info(f"表格行处理异常: {str(e)}")
  51. return None
  52. def download_monthly_data(driver, table_name, month_data):
  53. """Selenium版单月数据下载[6,8](@ref)"""
  54. month_num, link = month_data
  55. safe_name = re.sub(r'[\\/*?:"<>|]', "", table_name).replace(' ', '_')
  56. try:
  57. # 执行下载操作
  58. driver.get(f"{link}")
  59. download_btn = WebDriverWait(driver, 15).until(
  60. EC.presence_of_element_located((By.CSS_SELECTOR,
  61. 'span.easysite-isprase a[href$=".xls"], span.easysite-isprase a[href$=".xlsx"]'))
  62. )
  63. # log.info(f"excel链接:{download_btn.get_attribute("outerHTML")}")
  64. ActionChains(driver).move_to_element(download_btn).click().perform()
  65. # 等待下载完成
  66. downloaded_file = wait_for_download(download_dir)
  67. # 文件整理
  68. target_dir = Path(f"{download_dir}/{YEAR}/{month_data:02d}月")
  69. target_dir.mkdir(parents=True, exist_ok=True)
  70. # 构造最终文件路径
  71. final_path = target_dir / f"{safe_name}{downloaded_file.suffix}"
  72. # 覆盖处理逻辑
  73. if final_path.exists():
  74. try:
  75. os.remove(final_path) # 删除已有文件
  76. # log.info(f"检测到旧文件,已删除:{final_path}")
  77. except Exception as e:
  78. log.info(f"文件删除失败:{str(e)}")
  79. raise
  80. downloaded_file.rename(final_path)
  81. log.info(f"√ 成功下载:{final_path}")
  82. return True
  83. except Exception as e:
  84. log.info(f"× 下载失败 {table_name} {month_num}月:{str(e)}")
  85. driver.save_screenshot(f'error_{safe_name}_{month_num:02d}.png')
  86. return False
  87. def crawl_with_selenium(url):
  88. driver = webdriver.Firefox(options=configure_stealth_options(download_dir))
  89. try:
  90. driver.execute_script("""
  91. Object.defineProperty(navigator, 'webdriver', {
  92. get: () => undefined
  93. });
  94. window.alert = () => {};
  95. """)
  96. driver.get(url)
  97. WebDriverWait(driver, 30).until(
  98. lambda d: d.execute_script("return document.readyState === 'complete'")
  99. )
  100. while True:
  101. # 动态获取当前有效行(每次循环重新查询)
  102. try:
  103. table = WebDriverWait(driver, 20).until(
  104. EC.presence_of_element_located((By.CSS_SELECTOR, f"#yb{YEAR}RMB"))
  105. )
  106. current_rows = table.find_elements(By.CSS_SELECTOR, "tr:not(:first-child)")
  107. if not current_rows:
  108. log.info("所有表格处理完成")
  109. break
  110. # 仅处理当前首行(避免批量失效)
  111. row = current_rows[0]
  112. result = process_table_row(row)
  113. if result and result[0] in TARGET_TABLES:
  114. table_name, month_links = result
  115. log.info(f"\n开始处理表格:{table_name}")
  116. # 处理月份数据
  117. handle_month_data(driver, table_name, month_links)
  118. # 删除已处理行并验证DOM更新
  119. driver.execute_script("arguments[0].remove()", row)
  120. WebDriverWait(driver, 10).until(
  121. EC.staleness_of(row)
  122. )
  123. time.sleep(random.uniform(1, 3)) # 下载间隔
  124. except StaleElementReferenceException:
  125. log.info("检测到元素失效,自动刷新表格")
  126. driver.refresh()
  127. WebDriverWait(driver, 30).until(
  128. EC.presence_of_element_located((By.CSS_SELECTOR, f"#yb{YEAR}RMB"))
  129. )
  130. finally:
  131. driver.quit()
  132. def handle_month_data(driver, table_name, month_links):
  133. main_window = driver.current_window_handle
  134. for idx, month_data in enumerate(month_links):
  135. if 1 <= month_data[0] <= 12:
  136. # 新标签页策略(防止主页面DOM变更)
  137. driver.switch_to.window(main_window)
  138. driver.execute_script(f"window.open('{month_data[1]}', '_blank_{idx}')")
  139. driver.switch_to.window(driver.window_handles[-1])
  140. month_num, link = month_data
  141. try:
  142. download_excel(driver, link, YEAR, month_num, table_name, download_dir)
  143. except Exception as e:
  144. log.info(f"【异常】下载失败: {str(e)}")
  145. time.sleep(random.uniform(0.5, 1.5)) # 下载间隔
  146. if __name__ == "__main__":
  147. log.info("【海关总署】全年数据抓取开始".center(66, "*"))
  148. crawl_with_selenium(base_url)
  149. log.info("【海关总署】全年数据抓取结束".center(66, "*"))