from playwright.sync_api import sync_playwright
import re
import time
import random
from pathlib import Path
from faker import Faker

# Report year whose monthly statistics tables are downloaded.
YEAR = 2023

# Exact table titles (as rendered on the customs site) that we want to fetch.
TARGET_TABLES = [
    f"(2){YEAR}年进出口商品国别(地区)总值表",
    f"(4){YEAR}年进出口商品类章总值表",
    f"(8){YEAR}年进出口商品收发货人所在地总值表",
    f"(15){YEAR}年对部分国家(地区)出口商品类章金额表",
    f"(16){YEAR}年自部分国家(地区)进口商品类章金额表"
]


def process_table_row(row):
    """Parse one <tr> of the statistics index table.

    The first cell holds the table title; the second cell holds per-month
    links ("1月" .. "12月"). Returns (table_name, [(month_num, href), ...])
    sorted ascending by month, or None if the row is malformed or parsing
    raises.
    """
    try:
        cells = row.locator('td').all()
        if len(cells) < 2:
            # Not a data row (e.g. spacer/merged row) — skip it.
            return None
        table_name = cells[0].inner_text(timeout=8000).strip()
        month_links = [
            (int(a.inner_text().replace('月', '')), a.get_attribute('href'))
            for a in cells[1].locator('a').all()
            if a.is_visible() and a.get_attribute('href')
        ]
        # Sort months ascending (1-12) so downloads proceed chronologically.
        month_links.sort(key=lambda x: x[0])
        return (table_name, month_links)
    except Exception as e:
        print(f"表格行处理异常: {str(e)}")
        return None


def download_monthly_data(page, table_name, month_data):
    """Download one month's spreadsheet for one table.

    Navigates to the month's detail page, clicks the .xls/.xlsx link, and
    saves the file under ../src/downloads/<YEAR>/<MM>月/<table>.<ext>.
    Returns True on success, False on failure (a screenshot is saved for
    diagnosis). On success the page is navigated back to the index.
    """
    month_num, link = month_data
    # Strip characters that are illegal in Windows filenames.
    safe_name = re.sub(r'[\\/*?:"<>|]', "", table_name).replace(' ', '_')
    try:
        # BUGFIX: navigate *before* opening the expect_download context.
        # The original wrapped this goto (timeout 80s) inside
        # page.expect_download(), whose own default timeout (30s) started
        # counting during navigation and could expire before the download
        # button was clicked. Only the triggering click belongs inside.
        page.goto(f"http://www.customs.gov.cn{link}",
                  wait_until="networkidle", timeout=80000)

        # Generic download-link locator (handles both .xls and .xlsx pages).
        download_btn = page.locator(
            'span.easysite-isprase a[href$=".xls"], '
            'span.easysite-isprase a[href$=".xlsx"]'
        )
        with page.expect_download() as download_info:
            download_btn.click(timeout=15000)
        download = download_info.value

        # Derive the extension from the server-suggested filename,
        # defaulting to 'xls' when it has none.
        file_ext = (download.suggested_filename.split('.')[-1]
                    if '.' in download.suggested_filename else 'xls')

        # One directory per month, created on demand.
        download_dir = Path('../src/downloads') / f"{YEAR}/{month_num:02d}月"
        download_dir.mkdir(parents=True, exist_ok=True)

        final_path = download_dir / f"{safe_name}.{file_ext}"
        download.save_as(final_path)
        print(f"√ 成功下载:{final_path}")

        # Return to the index page and wait for it to settle before the
        # caller processes the next month.
        page.go_back()
        page.wait_for_load_state('networkidle')
        return True
    except Exception as e:
        print(f"× 下载失败 {table_name} {month_num}月:{str(e)}")
        # Keep a screenshot of the failing state for debugging.
        page.screenshot(path=f'error_{safe_name}_{month_num:02d}.png')
        return False


def crawl_with_fingerprint(url):
    """Crawl the yearly index page and download all target tables.

    Launches Firefox with a randomized user agent and mild anti-automation
    masking, iterates the index table's rows, and downloads every valid
    month (1-12) of each table listed in TARGET_TABLES, with retry and
    randomized pacing between downloads.
    """
    with sync_playwright() as p:
        browser = p.firefox.launch(
            headless=True,
            # NOTE(review): these are Chromium-style switches; Firefox
            # likely ignores them — confirm whether they are needed at all.
            args=[
                '--disable-blink-features=AutomationControlled',
                '--lang=zh-CN',
                '--window-size=1440,900'
            ]
        )
        context = browser.new_context(
            user_agent=Faker().firefox(),
            viewport={'width': 1440, 'height': 900},
            device_scale_factor=1,
            accept_downloads=True,
            extra_http_headers={
                "Host": "www.customs.gov.cn",
                "Accept-Language": "zh-CN,zh;q=0.9"
            }
        )
        try:
            page = context.new_page()
            # Mask the webdriver flag and suppress alert() dialogs before
            # any page script runs.
            page.add_init_script("""
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                });
                window.alert = () => {};
            """)

            page.goto(url, wait_until="networkidle", timeout=60000)

            # All rows of the RMB-denominated yearly table, minus the header.
            rows = page.locator(f'#yb{YEAR}RMB tr').all()[1:]
            print(f"共找到 {len(rows)} 个表格")

            for row in rows:
                result = process_table_row(row)
                if not result:
                    continue
                table_name, month_links = result
                if table_name not in TARGET_TABLES:
                    continue

                print(f"\n开始处理表格:{table_name}")
                for month_data in month_links:
                    month_num = month_data[0]
                    if 1 <= month_num <= 12:  # guard against stray links
                        # Up to 2 attempts per month before giving up.
                        retry_count = 0
                        while retry_count < 2:
                            if download_monthly_data(page, table_name,
                                                     month_data):
                                break
                            retry_count += 1
                            time.sleep(5)
                        # Randomized pause between downloads to reduce the
                        # chance of anti-scraping countermeasures.
                        time.sleep(random.uniform(3, 8))

                # Drop the processed row from the DOM to release memory.
                row.evaluate('element => element.remove()')
        finally:
            context.close()
            browser.close()


if __name__ == "__main__":
    # BUGFIX: parents=True so this does not raise FileNotFoundError when
    # '../src' itself is missing (the per-month mkdir already uses it).
    Path('../src/downloads').mkdir(parents=True, exist_ok=True)
    target_url = ("http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/"
                  "302274/302277/4899681/index.html")
    crawl_with_fingerprint(target_url)
    print("全年数据下载任务已完成")