pc.py

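# Downloads the monthly "2025年进出口商品收发货人所在地总值表" (total import/export
# value by location of consignee/consignor) Excel files from the China Customs
# website, with a daily job scheduled for 09:00.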
import os
import time

import requests
import schedule
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By


def download_excel(url, save_path):
    """Download an Excel file to save_path."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Referer': 'https://www.google.com/',  # simulate a legitimate referrer page
    }
    # stream=True writes large files chunk by chunk instead of holding them in memory
    response = requests.get(url, headers=headers, stream=True)
    response.raise_for_status()
    with open(save_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)


def find_and_download_monthly_data():
    try:
        # 1. Fetch the main page
        main_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html"
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Encoding": "gzip, deflate",  # requests decompresses these automatically
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "AV7KYchI7HHaS=5sC6lIXRxEGXW6dT63ZBwGHY4pma1LIP4nuaP5fqUi7S8d7D3nolW7IA9MoTWDQ8S8Pi6.uGvZmBHNYlJsClRVa;...",
            # paste the full Cookie header from your browser here
            "Host": "www.customs.gov.cn",
            "Referer": "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.0.0"
        }
        response = requests.get(main_url, headers=headers)
        print(f"response: {response}")
        soup = BeautifulSoup(response.text, 'html.parser')
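        # BeautifulSoup handles the static index HTML here; the per-month pages
        # are rendered dynamically, which is why step 4 switches to Selenium.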
        # 2. Locate the target table row
        target_row = None
        for row in soup.select('tr'):
            if '2025年进出口商品收发货人所在地总值表' in row.text:
                target_row = row
                break
        if not target_row:
            print("Target table row not found")
            return
        # 3. Collect the per-month links from the target row
        month_links = []
        for cell in target_row.find_all('a', href=True):
            if any(str(m) in cell.text for m in range(1, 13)):
                month_links.append(cell['href'])
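        # Note: this substring test is loose ("1" also matches "10月", "11月",
        # "12月"); tighten the filter if the row ever carries unrelated links.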
        # 4. Use Selenium for the dynamically rendered month pages
        driver = webdriver.Chrome()
        try:
            for i, link in enumerate(month_links, start=1):
                full_url = requests.compat.urljoin(main_url, link)
                driver.get(full_url)
                # Find the download button
                download_btn = driver.find_element(By.XPATH, '//a[contains(text(),"下载")]')
                excel_url = download_btn.get_attribute('href')
                # Save the file; the index keeps one month's download from overwriting another's
                filename = f"{time.strftime('%Y%m')}_{i:02d}_海关数据.xlsx"
                download_excel(excel_url, os.path.join('../src/downloads', filename))
        finally:
            # quit() even on failure so stray Chrome processes don't pile up
            driver.quit()
    except Exception as e:
        print(f"Error: {e}")


# Run every day at 09:00
schedule.every().day.at("09:00").do(find_and_download_monthly_data)

if __name__ == "__main__":
    # Create the download directory if it does not exist
    os.makedirs('../src/downloads', exist_ok=True)
    find_and_download_monthly_data()
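    # schedule only registers the job; nothing fires unless run_pending() is
    # polled, so keep the process alive and check once a minute.
    while True:
        schedule.run_pending()
        time.sleep(60)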