import os
import time

import requests
import schedule
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

# Directory where downloaded Excel files are stored (used by both the
# downloader and the __main__ bootstrap below).
DOWNLOAD_DIR = '../src/downloads'


def download_excel(url, save_path):
    """Download an Excel file from *url* and stream it to *save_path*.

    Raises requests.HTTPError on a non-2xx response so an HTML error
    page is never silently saved with an .xlsx extension.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Referer': 'https://www.google.com/'  # simulate a legitimate referrer page
    }
    # stream=True so the file is written chunk-by-chunk; the original
    # called iter_content() on an already fully-buffered response, which
    # loads the whole file into memory first.  timeout prevents a hung
    # connection from blocking the scheduled job forever.
    response = requests.get(url, headers=headers, stream=True, timeout=30)
    response.raise_for_status()
    with open(save_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)


def find_and_download_monthly_data():
    """Scrape the customs site and download each month's Excel table.

    Steps: fetch the index page, locate the table row for the 2025
    monthly import/export value table, collect its per-month links, then
    drive a headful Chrome through each link to resolve the dynamic
    download URL and save the file.  All errors are logged, not raised,
    so a failed run does not kill the scheduler loop.
    """
    driver = None
    try:
        # 1. Fetch the main listing page.
        main_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html"
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Encoding": "gzip, deflate",  # requests transparently decompresses these
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "AV7KYchI7HHaS=5sC6lIXRxEGXW6dT63ZBwGHY4pma1LIP4nuaP5fqUi7S8d7D3nolW7IA9MoTWDQ8S8Pi6.uGvZmBHNYlJsClRVa;...",  # NOTE(review): copied from a browser session; will expire — verify before each run
            "Host": "www.customs.gov.cn",
            "Referer": "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.0.0"
        }
        response = requests.get(main_url, headers=headers, timeout=30)
        response.raise_for_status()
        # Fixed: the original `print("res...{}",response)` never formatted
        # the placeholder — it printed the literal string and the object.
        print(f"res...{response}")
        soup = BeautifulSoup(response.text, 'html.parser')

        # 2. Locate the target table row by its (Chinese) caption text.
        target_row = None
        for row in soup.select('tr'):
            if '2025年进出口商品收发货人所在地总值表' in row.text:
                target_row = row
                break
        if not target_row:
            print("未找到目标表格行")
            return

        # 3. Collect the month links.  Keep (label, href) pairs so each
        #    saved file gets a distinct name — the original named every
        #    file after the *current* month, so each download overwrote
        #    the previous one.
        month_links = []
        for cell in target_row.find_all('a', href=True):
            if any(str(m) in cell.text for m in range(1, 13)):
                month_links.append((cell.text.strip(), cell['href']))

        # 4. Use Selenium for the dynamically-rendered download pages.
        driver = webdriver.Chrome()
        for index, (label, link) in enumerate(month_links, start=1):
            full_url = requests.compat.urljoin(main_url, link)
            driver.get(full_url)
            # Find the download button and resolve the real file URL.
            download_btn = driver.find_element(By.XPATH, '//a[contains(text(),"下载")]')
            excel_url = download_btn.get_attribute('href')
            # Per-link filename: year + the link's own label (fall back
            # to the link's position when the label is empty).
            safe_label = label or f"{index:02d}"
            filename = f"{time.strftime('%Y')}_{safe_label}_海关数据.xlsx"
            download_excel(excel_url, os.path.join(DOWNLOAD_DIR, filename))
    except Exception as e:
        print(f"发生错误: {str(e)}")
    finally:
        # Always release the browser — the original only called quit()
        # on the success path and leaked the ChromeDriver process
        # whenever any link raised mid-loop.
        if driver is not None:
            driver.quit()


# Schedule a daily run at 09:00.
schedule.every().day.at("09:00").do(find_and_download_monthly_data)

if __name__ == "__main__":
    # Ensure the download directory exists (exist_ok avoids the
    # check-then-create race of the original exists()/makedirs pair).
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    # Run once immediately, then keep the process alive so the scheduled
    # 09:00 job actually fires — the original registered the job but
    # never called run_pending(), so the schedule never executed.
    find_and_download_monthly_data()
    while True:
        schedule.run_pending()
        time.sleep(60)