selenium_download.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231
  1. import os
  2. import re
  3. import time
  4. from pathlib import Path
  5. from faker import Faker
  6. from selenium import webdriver
  7. from selenium.common import StaleElementReferenceException
  8. from selenium.webdriver import FirefoxOptions, ActionChains
  9. from selenium.webdriver.support import expected_conditions as EC
  10. from selenium.webdriver.support.ui import WebDriverWait
  11. YEAR = 2025
  12. TARGET_TABLES = [
  13. f"(2){YEAR}年进出口商品国别(地区)总值表",
  14. f"(4){YEAR}年进出口商品类章总值表",
  15. f"(8){YEAR}年进出口商品收发货人所在地总值表",
  16. f"(15){YEAR}年对部分国家(地区)出口商品类章金额表",
  17. f"(16){YEAR}年自部分国家(地区)进口商品类章金额表"
  18. ]
  19. def wait_for_download_complete(download_dir, timeout=15):
  20. """监控下载目录(包括子目录)变化实现下载等待"""
  21. initial_files = set(Path(download_dir).rglob('*')) # 使用 rglob 递归获取所有文件
  22. start_time = time.time()
  23. while (time.time() - start_time) < timeout:
  24. current_files = set(Path(download_dir).rglob('*')) # 同样使用 rglob 获取当前所有文件
  25. new_files = current_files - initial_files # 获取新增文件
  26. if new_files: # 如果有新文件
  27. return max(new_files, key=lambda f: f.stat().st_ctime) # 返回最新的下载文件
  28. time.sleep(1)
  29. raise TimeoutError("文件下载超时")
  30. from selenium.webdriver.common.by import By
  31. def process_table_row(row):
  32. """动态处理表格行数据(Selenium语法)"""
  33. try:
  34. # 获取所有表格单元格(td)元素
  35. cells = row.find_elements(By.TAG_NAME, 'td')
  36. if len(cells) < 2:
  37. return None
  38. # 获取表格名
  39. table_name = cells[0].text.strip()
  40. # 获取第二列中的所有链接,提取月份和href
  41. month_links = []
  42. links = cells[1].find_elements(By.TAG_NAME, 'a')
  43. for a in links:
  44. # 获取文本并去掉‘月’
  45. month_text = a.text
  46. if '月' in month_text:
  47. month = int(month_text.replace('月', '').strip())
  48. href = a.get_attribute('href')
  49. if href:
  50. month_links.append((month, href))
  51. # 按月份升序排列(1-12月)
  52. month_links.sort(key=lambda x: x[0])
  53. return (table_name, month_links)
  54. except Exception as e:
  55. print(f"表格行处理异常: {str(e)}")
  56. return None
  57. def download_monthly_data(driver, table_name, month_data):
  58. """Selenium版单月数据下载[6,8](@ref)"""
  59. month_num, link = month_data
  60. safe_name = re.sub(r'[\\/*?:"<>|]', "", table_name).replace(' ', '_')
  61. try:
  62. download_dir = os.path.abspath(f"downloads/{YEAR}")
  63. # initial_files = set(download_dir.glob('*'))
  64. # 执行下载操作
  65. driver.get(f"{link}")
  66. download_btn = WebDriverWait(driver, 15).until(
  67. EC.presence_of_element_located((By.CSS_SELECTOR,
  68. 'span.easysite-isprase a[href$=".xls"], span.easysite-isprase a[href$=".xlsx"]'))
  69. )
  70. # print(f"excel链接:{download_btn.get_attribute("outerHTML")}")
  71. ActionChains(driver).move_to_element(download_btn).click().perform()
  72. # 等待下载完成
  73. downloaded_file = wait_for_download_complete(download_dir)
  74. # 文件整理
  75. target_dir = Path(f"{download_dir}/{month_num:02d}月")
  76. target_dir.mkdir(parents=True, exist_ok=True)
  77. # 构造最终文件路径
  78. final_path = target_dir / f"{safe_name}{downloaded_file.suffix}"
  79. # 覆盖处理逻辑
  80. if final_path.exists():
  81. try:
  82. os.remove(final_path) # 删除已有文件
  83. # print(f"检测到旧文件,已删除:{final_path}")
  84. except Exception as e:
  85. print(f"文件删除失败:{str(e)}")
  86. raise
  87. downloaded_file.rename(final_path)
  88. print(f"√ 成功下载:{final_path}")
  89. return True
  90. except Exception as e:
  91. print(f"× 下载失败 {table_name} {month_num}月:{str(e)}")
  92. driver.save_screenshot(f'error_{safe_name}_{month_num:02d}.png')
  93. return False
  94. def configure_stealth_options():
  95. """增强型反检测配置[1,4](@ref)"""
  96. opts = FirefoxOptions()
  97. download_dir = os.path.abspath(f"downloads/{YEAR}")
  98. # 文件下载配置
  99. opts.set_preference("browser.download.dir", download_dir)
  100. opts.set_preference("browser.download.folderList", 2)
  101. opts.set_preference("browser.download.manager.showWhenStarting", False)
  102. opts.set_preference("browser.helperApps.neverAsk.saveToDisk",
  103. "application/octet-stream, application/vnd.ms-excel") # 覆盖常见文件类型
  104. # 反检测参数
  105. opts.set_preference("dom.webdriver.enabled", False)
  106. opts.set_preference("useAutomationExtension", False)
  107. opts.add_argument("--disable-blink-features=AutomationControlled")
  108. # 动态指纹
  109. fake = Faker()
  110. opts.set_preference("general.useragent.override", fake.firefox())
  111. opts.set_preference("intl.accept_languages", "zh-CN,zh;q=0.9")
  112. # 视口配置
  113. opts.add_argument("--width=1440")
  114. opts.add_argument("--height=900")
  115. opts.add_argument("--headless")
  116. return opts
  117. def crawl_with_selenium(url):
  118. driver = webdriver.Firefox(options=configure_stealth_options())
  119. try:
  120. # 注入反检测脚本
  121. driver.execute_script("""
  122. Object.defineProperty(navigator, 'webdriver', {
  123. get: () => undefined
  124. });
  125. window.alert = () => {};
  126. """)
  127. # 页面加载策略[7,8](@ref)
  128. driver.get(url)
  129. WebDriverWait(driver, 30).until(
  130. lambda d: d.execute_script("return document.readyState === 'complete'")
  131. )
  132. while True:
  133. # 动态获取当前有效行(每次循环重新查询)
  134. try:
  135. table = WebDriverWait(driver, 20).until(
  136. EC.presence_of_element_located((By.CSS_SELECTOR, f"#yb{YEAR}RMB"))
  137. )
  138. current_rows = table.find_elements(By.CSS_SELECTOR, "tr:not(:first-child)")
  139. if not current_rows:
  140. print("所有表格处理完成")
  141. break
  142. # 仅处理当前首行(避免批量失效)
  143. row = current_rows[0]
  144. result = process_table_row(row)
  145. if result and result[0] in TARGET_TABLES:
  146. table_name, month_links = result
  147. print(f"\n开始处理表格:{table_name}")
  148. # 处理月份数据(关键修改点)
  149. handle_month_data(driver, table_name, month_links)
  150. # 删除已处理行并验证DOM更新
  151. driver.execute_script("arguments[0].remove()", row)
  152. WebDriverWait(driver, 10).until(
  153. EC.staleness_of(row) # 强制等待元素失效[2,7](@ref)
  154. )
  155. except StaleElementReferenceException:
  156. print("检测到元素失效,自动刷新表格")
  157. driver.refresh()
  158. WebDriverWait(driver, 30).until(
  159. EC.presence_of_element_located((By.CSS_SELECTOR, f"#yb{YEAR}RMB"))
  160. )
  161. finally:
  162. driver.quit()
  163. def handle_month_data(driver, table_name, month_links):
  164. main_window = driver.current_window_handle
  165. for idx, month_data in enumerate(month_links):
  166. if 1 <= month_data[0] <= 12:
  167. # 新标签页策略(防止主页面DOM变更)
  168. driver.switch_to.window(main_window)
  169. driver.execute_script(f"window.open('{month_data[1]}', '_blank_{idx}')")
  170. driver.switch_to.window(driver.window_handles[-1])
  171. # 下载逻辑
  172. try:
  173. if download_monthly_data(driver, table_name, month_data):
  174. print(f"{month_data[0]}月下载成功")
  175. finally:
  176. driver.close()
  177. driver.switch_to.window(main_window)
  178. WebDriverWait(driver, 10).until(
  179. EC.presence_of_element_located((By.CSS_SELECTOR, f"#yb{YEAR}RMB"))
  180. )
  181. if __name__ == "__main__":
  182. Path('downloads').mkdir(exist_ok=True)
  183. target_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html"
  184. crawl_with_selenium(target_url)
  185. print("全年数据下载任务已完成")