# selenium_shandong_read.py (6.9 KB)


import os
import random
import re
import time
from io import BytesIO
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from faker import Faker
from selenium import webdriver
from selenium.webdriver import FirefoxOptions, ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
  15. YEAR_MONTH = "2025年3月"
  16. TARGET_TITLES = [
  17. f"{YEAR_MONTH}山东省进口20位主要商品总值",
  18. f"{YEAR_MONTH}山东省出口20位主要商品总值",
  19. f"{YEAR_MONTH}山东省各地市进出口总值",
  20. f"{YEAR_MONTH}山东省进出口主要国别(地区)总值"
  21. ]
  22. URL = "http://qingdao.customs.gov.cn/qingdao_customs/406535/fdzdgknr30/406514/406515/index.html"
  23. def process_table_row(row):
  24. """动态处理表格行数据(Selenium语法)"""
  25. try:
  26. # 获取所有表格单元格(td)元素
  27. cells = row.find_elements(By.TAG_NAME, 'td')
  28. if len(cells) < 2:
  29. return None
  30. # 获取表格名
  31. table_name = cells[0].text.strip()
  32. # 获取第二列中的所有链接,提取月份和href
  33. month_links = []
  34. links = cells[1].find_elements(By.TAG_NAME, 'a')
  35. for a in links:
  36. # 获取文本并去掉‘月’
  37. month_text = a.text
  38. if '月' in month_text:
  39. month = int(month_text.replace('月', '').strip())
  40. href = a.get_attribute('href')
  41. if href:
  42. month_links.append((month, href))
  43. # 按月份升序排列(1-12月)
  44. month_links.sort(key=lambda x: x[0])
  45. return (table_name, month_links)
  46. except Exception as e:
  47. print(f"表格行处理异常: {str(e)}")
  48. return None
  49. def configure_stealth_options():
  50. """增强型反检测配置[1,4](@ref)"""
  51. opts = FirefoxOptions()
  52. download_dir = os.path.abspath(os.path.join('../../downloads', "2025"))
  53. print("当前下载路径:", Path(download_dir).resolve())
  54. # 文件下载配置
  55. opts.set_preference("browser.download.dir", download_dir)
  56. opts.set_preference("browser.download.folderList", 2)
  57. opts.set_preference("browser.download.manager.showWhenStarting", False)
  58. opts.set_preference("browser.helperApps.neverAsk.saveToDisk",
  59. "application/octet-stream, application/vnd.ms-excel") # 覆盖常见文件类型
  60. opts.set_preference("browser.download.manager.useWindow", False) # 禁用下载管理器窗口
  61. opts.set_preference("browser.download.manager.showAlertOnComplete", False) # 关闭完成提示
  62. # 反检测参数
  63. opts.set_preference("dom.webdriver.enabled", False)
  64. opts.set_preference("useAutomationExtension", False)
  65. opts.add_argument("--disable-blink-features=AutomationControlled")
  66. # 动态指纹
  67. fake = Faker()
  68. opts.set_preference("general.useragent.override", fake.firefox())
  69. opts.set_preference("intl.accept_languages", "zh-CN,zh;q=0.9")
  70. # 视口配置
  71. opts.add_argument("--width=1440")
  72. opts.add_argument("--height=900")
  73. opts.add_argument("--headless")
  74. return opts
  75. def find_target_links(driver):
  76. """定位目标列表项(网页7、8的XPath文本定位方案)"""
  77. WebDriverWait(driver, 20).until(
  78. EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
  79. )
  80. targets = []
  81. # 使用XPath精准匹配标题文本
  82. for title in TARGET_TITLES:
  83. xpath = f'//ul[@class="conList_ul"]//a[@title="{title}"]'
  84. link = WebDriverWait(driver, 10).until(
  85. EC.presence_of_element_located((By.XPATH, xpath))
  86. ).get_attribute("href")
  87. targets.append((title, link))
  88. time.sleep(random.uniform(1, 3)) # 随机延迟防检测
  89. return targets
  90. def wait_for_download_complete(download_dir, timeout=60): # 延长超时时间至60秒
  91. temp_extensions = ('.part', '.crdownload')
  92. start_time = time.time()
  93. while (time.time() - start_time) < timeout:
  94. current_files = set(Path(download_dir).rglob('*'))
  95. # 过滤临时文件和未完成下载的文件
  96. valid_files = {f for f in current_files if not f.name.endswith(temp_extensions)}
  97. if valid_files:
  98. try:
  99. newest_file = max(valid_files, key=lambda f: f.stat().st_ctime)
  100. with newest_file.open('rb') as test_file: # 尝试读取文件
  101. return newest_file
  102. except (PermissionError, IOError):
  103. continue # 文件仍被占用或未完成写入
  104. time.sleep(1)
  105. raise TimeoutError("文件下载超时")
  106. def read_remote_excel(url):
  107. try:
  108. # 发送HTTP请求获取文件流
  109. response = requests.get(url, timeout=30)
  110. response.raise_for_status() # 检查状态码
  111. # 将二进制流转换为DataFrame
  112. excel_data = pd.read_excel(
  113. io=response.content,
  114. engine='openpyxl' # 必须指定引擎(网页6)
  115. )
  116. return excel_data
  117. except requests.exceptions.RequestException as e:
  118. print(f"远程读取失败: {str(e)}")
  119. return None
  120. def download_excel(title ,driver, url):
  121. """处理下载逻辑(网页7的新标签页策略)"""
  122. main_window = driver.current_window_handle
  123. # 新标签页打开(避免主页面DOM变化)
  124. driver.execute_script(f"window.open('{url}')")
  125. driver.switch_to.window(driver.window_handles[-1])
  126. try:
  127. # 等待下载按钮出现
  128. excel_link = WebDriverWait(driver, 15).until(
  129. EC.presence_of_element_located((By.XPATH, '//a[text()="表格下载" and contains(@href, ".xls")]'))
  130. )
  131. # 获取相对路径并转换为绝对URL(关键步骤[2,7](@ref))
  132. relative_url = excel_link.get_attribute("href")
  133. base_url = "http://qingdao.customs.gov.cn" # 根据实际情况调整
  134. absolute_url = urljoin(base_url, relative_url)
  135. print(f"数据:{read_remote_excel(absolute_url)}")
  136. finally:
  137. driver.close()
  138. driver.switch_to.window(main_window)
  139. def crawl_with_selenium(url):
  140. driver = webdriver.Firefox(options=configure_stealth_options())
  141. try:
  142. # 注入反检测脚本
  143. driver.execute_script("""
  144. Object.defineProperty(navigator, 'webdriver', {
  145. get: () => undefined
  146. });
  147. window.alert = () => {};
  148. """)
  149. # 页面加载策略[7,8](@ref)
  150. driver.get(url)
  151. # 获取目标链接
  152. targets = find_target_links(driver)
  153. # 遍历下载
  154. for title, url in targets:
  155. print(f"正在处理:{title}")
  156. download_excel(title, driver, url)
  157. time.sleep(random.randint(5, 10)) # 大间隔防封禁
  158. finally:
  159. driver.quit()
  160. if __name__ == "__main__":
  161. crawl_with_selenium(URL)
  162. print(f"山东省{YEAR_MONTH}下载任务已完成")