# Download the monthly statistics tables named in TARGET_TABLES (.xls/.xlsx
# attachments) from www.customs.gov.cn using Playwright.
import random
import re
import time
from pathlib import Path
from urllib.parse import urljoin

from faker import Faker
from playwright.sync_api import sync_playwright
# Statistics year; used in the table titles, the listing-table element id and
# the download directory name.
YEAR = 2023

# Exact titles of the tables to download. A listing row is processed only
# when its title cell matches one of these strings verbatim.
TARGET_TABLES = [
    f"(2){YEAR}年进出口商品国别(地区)总值表",
    f"(4){YEAR}年进出口商品类章总值表",
    f"(8){YEAR}年进出口商品收发货人所在地总值表",
    f"(15){YEAR}年对部分国家(地区)出口商品类章金额表",
    f"(16){YEAR}年自部分国家(地区)进口商品类章金额表"
]
def process_table_row(row):
    """Extract the table title and its per-month links from one listing row.

    Args:
        row: Playwright locator (or duck-typed equivalent) for a ``<tr>``.
            The first ``<td>`` is read as the table title; the second is
            expected to hold one ``<a>`` per month, labelled like ``3月``.

    Returns:
        ``(table_name, month_links)`` where ``month_links`` is a list of
        ``(month_number, href)`` tuples sorted by month, or ``None`` when the
        row has fewer than two cells or reading it raises.
    """
    try:
        cells = row.locator('td').all()
        if len(cells) < 2:
            return None

        table_name = cells[0].inner_text(timeout=8000).strip()

        month_links = []
        for anchor in cells[1].locator('a').all():
            if not anchor.is_visible():
                continue
            # Read the attribute once instead of once for the filter and
            # once for the value.
            href = anchor.get_attribute('href')
            if not href:
                continue
            try:
                month = int(anchor.inner_text().replace('月', ''))
            except ValueError:
                # A link whose label is not "<n>月" is skipped on its own;
                # it must not cause the whole row to be discarded.
                continue
            month_links.append((month, href))

        # Ascending month order.
        month_links.sort(key=lambda item: item[0])
        return (table_name, month_links)
    except Exception as e:
        # Best-effort boundary: a row that cannot be read is reported and
        # skipped so the remaining rows are still processed.
        print(f"表格行处理异常: {str(e)}")
        return None
def _return_to_listing(page, listing_url):
    """Best-effort: navigate *page* back to *listing_url* if it has left it."""
    try:
        if page.url != listing_url:
            page.goto(listing_url, wait_until="networkidle", timeout=80000)
    except Exception as e:
        print(f"× 返回列表页失败:{str(e)}")


def download_monthly_data(page, table_name, month_data):
    """Download the spreadsheet attached to one month's detail page.

    Opens the detail page, clicks the first ``.xls``/``.xlsx`` attachment
    link and saves the file as
    ``../src/downloads/<YEAR>/<MM>月/<safe table name>.<ext>``.

    Args:
        page: Playwright page, currently showing the listing page.
        table_name: Table title; used (sanitised) as the file name.
        month_data: ``(month_number, href)`` tuple for the month to fetch.

    Returns:
        ``True`` when the file was saved, ``False`` on any failure. The
        function does not raise; failures are printed and a screenshot is
        attempted. On every path the page is brought back to the URL it was
        showing when the function was called.
    """
    month_num, link = month_data
    safe_name = re.sub(r'[\\/*?:"<>|]', "", table_name).replace(' ', '_')
    # Remember where we started so a failed attempt cannot strand the caller
    # on a detail page (history-based go_back is wrong after a retry).
    listing_url = page.url
    try:
        # urljoin keeps root-relative hrefs working exactly as before and
        # additionally tolerates absolute or slash-less hrefs.
        page.goto(urljoin("http://www.customs.gov.cn", link),
                  wait_until="networkidle",
                  timeout=80000)
        download_btn = page.locator('span.easysite-isprase a[href$=".xls"], span.easysite-isprase a[href$=".xlsx"]')
        # Only the click is wrapped, so the download wait does not also have
        # to cover the page load. `.first` avoids a strict-mode error when
        # the page carries more than one matching attachment link.
        with page.expect_download() as download_info:
            download_btn.first.click(timeout=15000)
        download = download_info.value

        _, dot, ext = download.suggested_filename.rpartition('.')
        file_ext = ext if dot and ext else 'xls'

        # One directory per month.
        download_dir = Path('../src/downloads') / f"{YEAR}/{month_num:02d}月"
        download_dir.mkdir(parents=True, exist_ok=True)

        final_path = download_dir / f"{safe_name}.{file_ext}"
        download.save_as(final_path)
        print(f"√ 成功下载:{final_path}")
        return True
    except Exception as e:
        print(f"× 下载失败 {table_name} {month_num}月:{str(e)}")
        try:
            page.screenshot(path=f'error_{safe_name}_{month_num:02d}.png')
        except Exception as shot_err:
            # The screenshot is diagnostic only; it must not turn a reported
            # failure into an unhandled exception.
            print(f"× 截图失败:{str(shot_err)}")
        return False
    finally:
        _return_to_listing(page, listing_url)
def _collect_target_tables(page):
    """Parse the listing once and return ``[(table_name, month_links), ...]``
    for the rows whose title is in TARGET_TABLES."""
    rows = page.locator(f'#yb{YEAR}RMB tr').all()[1:]  # skip the header row
    print(f"共找到 {len(rows)} 个表格")
    targets = []
    for row in rows:
        result = process_table_row(row)
        if not result:
            continue
        table_name, month_links = result
        if table_name in TARGET_TABLES:
            targets.append((table_name, month_links))
    return targets


def _download_with_retry(page, table_name, month_data, attempts=2, pause=5):
    """Call download_monthly_data up to *attempts* times, sleeping *pause*
    seconds after each failed attempt; return whether one succeeded."""
    for _ in range(attempts):
        if download_monthly_data(page, table_name, month_data):
            return True
        time.sleep(pause)
    return False


def crawl_with_fingerprint(url):
    """Open the listing page at *url* and download every month of every
    table named in TARGET_TABLES.

    Launches headless Firefox with a Faker-generated Firefox user agent,
    hides ``navigator.webdriver`` and silences ``alert``. All listing rows
    are parsed before any download starts, then each (table, month) is
    downloaded with one retry and a random 3-8 s pause between months.

    Args:
        url: Address of the listing page containing the ``#yb<YEAR>RMB`` table.
    """
    with sync_playwright() as p:
        browser = p.firefox.launch(
            headless=True,
            # NOTE(review): these look like Chromium switches; confirm that
            # Firefox accepts or ignores them.
            args=[
                '--disable-blink-features=AutomationControlled',
                '--lang=zh-CN',
                '--window-size=1440,900'
            ]
        )
        try:
            context = browser.new_context(
                user_agent=Faker().firefox(),
                viewport={'width': 1440, 'height': 900},
                device_scale_factor=1,
                accept_downloads=True,
                # NOTE(review): extra headers are attached to every request
                # of the context; a fixed "Host" may be wrong for requests to
                # other hosts - confirm it is needed.
                extra_http_headers={
                    "Host": "www.customs.gov.cn",
                    "Accept-Language": "zh-CN,zh;q=0.9"
                }
            )
            try:
                page = context.new_page()
                page.add_init_script("""
                    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
                    window.alert = () => {};
                """)
                page.goto(url, wait_until="networkidle", timeout=60000)

                # Row locators returned by .all() are position-based, so they
                # are only reliable while the listing DOM is unchanged. Read
                # everything needed up front, and do not remove rows from the
                # DOM afterwards: that shifts the positions and makes the row
                # following each processed table get skipped.
                targets = _collect_target_tables(page)

                for table_name, month_links in targets:
                    print(f"\n开始处理表格:{table_name}")
                    for month_data in month_links:
                        if not 1 <= month_data[0] <= 12:  # ignore invalid months
                            continue
                        _download_with_retry(page, table_name, month_data)
                        # Random pause between downloads.
                        time.sleep(random.uniform(3, 8))
            finally:
                context.close()
        finally:
            browser.close()
if __name__ == "__main__":
    # parents=True so the script also works when ../src does not exist yet;
    # without it mkdir raises FileNotFoundError.
    Path('../src/downloads').mkdir(parents=True, exist_ok=True)
    target_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/4899681/index.html"
    crawl_with_fingerprint(target_url)
    # Printed when the crawl returns, whether or not every download succeeded.
    print("全年数据下载任务已完成")