- import os
- import time
- import requests
- import schedule
- from bs4 import BeautifulSoup
- from selenium import webdriver
- from selenium.webdriver.common.by import By
def download_excel(url, save_path):
    """Download an Excel file from *url* and save it to *save_path*.

    The response body is streamed to disk in 1 KiB chunks so large
    files are never held fully in memory.

    Args:
        url: Direct URL of the Excel file.
        save_path: Local filesystem path to write the file to.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Referer': 'https://www.google.com/'  # mimic a legitimate referrer page
    }
    # Bug fixes vs. original: stream=True so iter_content actually streams
    # (without it the whole body is buffered first); timeout so a stalled
    # server cannot hang the scheduled job; raise_for_status so an HTML
    # error page is not silently saved with an .xlsx extension.
    with requests.get(url, headers=headers, stream=True, timeout=30) as response:
        response.raise_for_status()
        with open(save_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
def find_and_download_monthly_data():
    """Scrape the China Customs listing page and download the Excel file
    for each month of the 2025 import/export table.

    Workflow:
      1. Fetch the index page with browser-like headers (the site rejects
         plain clients).
      2. Locate the table row for the 2025 monthly-value table.
      3. Collect the per-month links inside that row.
      4. Drive Chrome to each month page (content is rendered dynamically)
         and download the linked Excel file into ``../src/downloads``.

    All errors are caught and logged so a scheduled run never crashes the
    scheduler loop. The Chrome driver is always shut down, even on error.
    """
    driver = None
    try:
        # 1. Fetch the listing page.
        main_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html"
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Encoding": "gzip, deflate",  # requests decompresses these transparently
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            # NOTE(review): this session cookie is hard-coded and will expire;
            # it must be refreshed from a real browser session.
            "Cookie": "AV7KYchI7HHaS=5sC6lIXRxEGXW6dT63ZBwGHY4pma1LIP4nuaP5fqUi7S8d7D3nolW7IA9MoTWDQ8S8Pi6.uGvZmBHNYlJsClRVa;...",
            "Host": "www.customs.gov.cn",
            "Referer": "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.0.0"
        }
        response = requests.get(main_url, headers=headers, timeout=30)
        # Bug fix: the original `print("res...{}", response)` printed the
        # literal "{}" instead of the response object.
        print(f"res...{response}")
        soup = BeautifulSoup(response.text, 'html.parser')

        # 2. Locate the target table row by its caption text.
        target_row = None
        for row in soup.select('tr'):
            if '2025年进出口商品收发货人所在地总值表' in row.text:
                target_row = row
                break
        if not target_row:
            print("未找到目标表格行")
            return

        # 3. Collect month links. NOTE(review): this digit test is loose —
        # any link text containing a digit 1-9 matches; tighten the pattern
        # if the page layout changes.
        month_links = []
        for cell in target_row.find_all('a', href=True):
            if any(str(m) in cell.text for m in range(1, 13)):
                month_links.append(cell['href'])

        # 4. Month pages are rendered dynamically, so drive a real browser.
        driver = webdriver.Chrome()
        for index, link in enumerate(month_links, start=1):
            full_url = requests.compat.urljoin(main_url, link)
            driver.get(full_url)
            # Find the download anchor on the month page.
            download_btn = driver.find_element(By.XPATH, '//a[contains(text(),"下载")]')
            excel_url = download_btn.get_attribute('href')
            # Bug fix: the original named every file identically (current
            # year-month only), so each download overwrote the previous one;
            # include the per-link index to keep all months.
            filename = f"{time.strftime('%Y%m')}_{index:02d}_海关数据.xlsx"
            download_excel(excel_url, os.path.join('../src/downloads', filename))
    except Exception as e:
        print(f"发生错误: {str(e)}")
    finally:
        # Resource-leak fix: the original only quit the driver on the happy
        # path, leaking the Chrome process on any exception in the loop.
        if driver is not None:
            driver.quit()
# Register the scraper to run daily at 09:00 (fires via the polling loop
# in the __main__ block below).
schedule.every().day.at("09:00").do(find_and_download_monthly_data)

if __name__ == "__main__":
    # Ensure the download directory exists (idempotent).
    os.makedirs('../src/downloads', exist_ok=True)
    # Run once immediately, then keep polling the scheduler.
    # Bug fix: the original exited right after the first run, so the 09:00
    # job registered above could never execute — `schedule` only fires jobs
    # when run_pending() is called.
    find_and_download_monthly_data()
    while True:
        schedule.run_pending()
        time.sleep(60)  # one-minute resolution is ample for a daily job