- import os
- import time
- import requests
- import schedule
- from bs4 import BeautifulSoup
- from selenium import webdriver
- from selenium.webdriver.common.by import By
def download_excel(url, save_path):
    """Download an Excel file from *url* and save it to *save_path*.

    The response body is streamed to disk in 1 KiB chunks so large
    files are never held fully in memory.

    Args:
        url: Direct URL of the Excel file.
        save_path: Local filesystem path to write the file to.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Referer': 'https://www.google.com/'  # mimic a legitimate referrer page
    }
    # Bug fixes vs. original: stream=True so iter_content actually streams
    # (without it the whole body is buffered first); timeout so a stalled
    # server cannot hang the scheduled job; raise_for_status so an HTML
    # error page is not silently saved with an .xlsx extension.
    with requests.get(url, headers=headers, stream=True, timeout=30) as response:
        response.raise_for_status()
        with open(save_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
def find_and_download_monthly_data():
    """Scrape the China Customs listing page and download the Excel file
    for each month of the 2025 import/export table.

    Workflow:
      1. Fetch the index page with browser-like headers (the site rejects
         plain clients).
      2. Locate the table row for the 2025 monthly-value table.
      3. Collect the per-month links inside that row.
      4. Drive Chrome to each month page (content is rendered dynamically)
         and download the linked Excel file into ``../src/downloads``.

    All errors are caught and logged so a scheduled run never crashes the
    scheduler loop. The Chrome driver is always shut down, even on error.
    """
    driver = None
    try:
        # 1. Fetch the listing page.
        main_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html"
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Encoding": "gzip, deflate",  # requests decompresses these transparently
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            # NOTE(review): this session cookie is hard-coded and will expire;
            # it must be refreshed from a real browser session.
            "Cookie": "AV7KYchI7HHaS=5sC6lIXRxEGXW6dT63ZBwGHY4pma1LIP4nuaP5fqUi7S8d7D3nolW7IA9MoTWDQ8S8Pi6.uGvZmBHNYlJsClRVa;...",
            "Host": "www.customs.gov.cn",
            "Referer": "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.0.0"
        }
        response = requests.get(main_url, headers=headers, timeout=30)
        # Bug fix: the original `print("res...{}", response)` printed the
        # literal "{}" instead of the response object.
        print(f"res...{response}")
        soup = BeautifulSoup(response.text, 'html.parser')

        # 2. Locate the target table row by its caption text.
        target_row = None
        for row in soup.select('tr'):
            if '2025年进出口商品收发货人所在地总值表' in row.text:
                target_row = row
                break
        if not target_row:
            print("未找到目标表格行")
            return

        # 3. Collect month links. NOTE(review): this digit test is loose —
        # any link text containing a digit 1-9 matches; tighten the pattern
        # if the page layout changes.
        month_links = []
        for cell in target_row.find_all('a', href=True):
            if any(str(m) in cell.text for m in range(1, 13)):
                month_links.append(cell['href'])

        # 4. Month pages are rendered dynamically, so drive a real browser.
        driver = webdriver.Chrome()
        for index, link in enumerate(month_links, start=1):
            full_url = requests.compat.urljoin(main_url, link)
            driver.get(full_url)
            # Find the download anchor on the month page.
            download_btn = driver.find_element(By.XPATH, '//a[contains(text(),"下载")]')
            excel_url = download_btn.get_attribute('href')
            # Bug fix: the original named every file identically (current
            # year-month only), so each download overwrote the previous one;
            # include the per-link index to keep all months.
            filename = f"{time.strftime('%Y%m')}_{index:02d}_海关数据.xlsx"
            download_excel(excel_url, os.path.join('../src/downloads', filename))
    except Exception as e:
        print(f"发生错误: {str(e)}")
    finally:
        # Resource-leak fix: the original only quit the driver on the happy
        # path, leaking the Chrome process on any exception in the loop.
        if driver is not None:
            driver.quit()
# Register the scraper to run daily at 09:00 (fires via the polling loop
# in the __main__ block below).
schedule.every().day.at("09:00").do(find_and_download_monthly_data)

if __name__ == "__main__":
    # Ensure the download directory exists (idempotent).
    os.makedirs('../src/downloads', exist_ok=True)
    # Run once immediately, then keep polling the scheduler.
    # Bug fix: the original exited right after the first run, so the 09:00
    # job registered above could never execute — `schedule` only fires jobs
    # when run_pending() is called.
    find_and_download_monthly_data()
    while True:
        schedule.run_pending()
        time.sleep(60)  # one-minute resolution is ample for a daily job