| 
					
				 | 
			
			
				@@ -1,6 +1,7 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import argparse 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import random 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import re 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import sys 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import time 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from datetime import datetime 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -13,6 +14,7 @@ from selenium.webdriver.support.ui import WebDriverWait 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from crossborder.quanguo.data_cleaning_to_db import perform_data_cleanup_and_import 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from crossborder.utils.constants import DOWNLOAD_DIR 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from crossborder.utils.db_helper import DBHelper 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from crossborder.utils.dingtalk import send_dingtalk_message 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from crossborder.utils.download_utils import configure_stealth_options, download_excel 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from crossborder.utils.log import  get_logger 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -116,7 +118,7 @@ def go_to_year_page(driver, year): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         return False 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-def crawl_with_selenium(driver, year, latest_only=False): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def crawl_with_selenium(driver, year, latest_only=False,data_collected=False): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     """主抓取函数""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     if year < datetime.now().year: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         if not go_to_year_page(driver, year): 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -140,8 +142,21 @@ def crawl_with_selenium(driver, year, latest_only=False): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             table_title = generate_table_title(year) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if latest_only: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                if month_links: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    # 只取第一个月份(最新月份) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    month_links = [month_links[0]] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    new_month = month_links[0][0] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    log.info(f"【{table_name}】处理最新月份:{new_month}月") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    db = DBHelper() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    count = db.get_total_info_exist(f'{year}-{new_month:02d}') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    if count > 0: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                        log.error(f"数据库已存在【海关总署】 {year}-{new_month:02d} 收发件人数据,本次抓取终止") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                        break 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            data_collected = True 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             if result and result[0] in table_title: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                handle_month_data(driver, sanitize_filename(table_name), month_links, year=year, latest_only=latest_only) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                handle_month_data(driver, sanitize_filename(table_name), month_links, year=year) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             driver.execute_script("arguments[0].remove()", row) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             WebDriverWait(driver, 10).until(EC.staleness_of(row)) 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -159,9 +174,10 @@ def sanitize_filename(filename): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     return re.sub(r'[<>:"/\\|?*\x00-\x1F]', '-', filename) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-def handle_month_data(driver, table_name, month_links, year, latest_only): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def handle_month_data(driver, table_name, month_links, year): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     global downloaded_tables 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     main_window = driver.current_window_handle 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     for idx, month_data in enumerate(month_links): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         if 1 <= month_data[0] <= 12: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             # 年度表月度表只下载一次(最新月份数据) 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -181,39 +197,105 @@ def handle_month_data(driver, table_name, month_links, year, latest_only): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             except Exception as e: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                log.info(f"【异常】下载失败: {str(e)}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             time.sleep(random.uniform(0.5, 1.5))  # 下载间隔 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def  main(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-if __name__ == "__main__": 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    global current_year, start_year 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     parser = argparse.ArgumentParser(description="抓取海关总署年度数据") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     parser.add_argument("--year", type=int, help="起始年份,例如:--year 2023") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     args = parser.parse_args() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     start_time = time.time() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    years_processed = []  # 记录成功处理的年份 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    data_collected = False  # 是否有数据被采集 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    driver =  None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    current_year = datetime.now().year 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    start_year = args.year if args.year else current_year 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    years_to_crawl = list(range(start_year, current_year + 1)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    years_to_crawl.reverse() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 1. 确定采集年份范围 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        current_year = datetime.now().year 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        start_year = args.year if args.year else current_year 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 确保年份有效 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if start_year > current_year: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            log.error(f"起始年份 {start_year} 不能大于当前年份 {current_year}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            sys.exit(1) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    log.info(f"即将抓取 {start_year} - {current_year} 年度数据") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    options = configure_stealth_options(download_dir) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    driver = webdriver.Firefox(options=options) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 生成年份序列(从新到旧) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        years_to_crawl = list(range(start_year, current_year + 1)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        years_to_crawl.reverse() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    base_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    driver.get(base_url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log.info(f"【海关总署】开始抓取 {start_year}-{current_year} 年度数据".center(66, "*")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 2. 初始化浏览器 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        options = configure_stealth_options(download_dir) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        driver = webdriver.Firefox(options=options) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log.info("浏览器初始化完成") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 3. 访问基础页面 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        driver.get(base_url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 4. 年份遍历采集 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         for year in years_to_crawl: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            log.info(f"\n【{year}年】开始抓取...".center(66, "-")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            crawl_with_selenium(driver, year=year, latest_only=args.year is None) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                log.info(f"\n【{year}年】开始处理".center(66, "-")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                is_latest_only = (not args.year) and (year == current_year) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                # 执行年份采集 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                crawl_with_selenium(driver, year=year, latest_only=is_latest_only,data_collected =  data_collected) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                years_processed.append(year) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                log.info(f"【{year}年】处理完成".center(66, "-")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            except Exception as e: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                log.exception(f"⚠️ {year}年数据采集异常: {str(e)}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                send_dingtalk_message(f"【海关总署{year}年采集异常】{str(e)}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                continue 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    except Exception as e: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log.exception(f"‼️ 海关总署采集全局错误: {str(e)}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        send_dingtalk_message(f"【海关总署全局异常】{str(e)}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     finally: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        driver.quit() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        log.info("【海关总署】全年数据抓取结束".center(66, "*")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        log.info("\n数据清洗入库中...") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        perform_data_cleanup_and_import(current_year) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        log.info("\n数据清洗入库完毕...") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 5. 保证浏览器退出 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if 'driver' in locals(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            driver.quit() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            log.info("浏览器已退出") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 6. 数据清洗入库(仅当有数据被采集时) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if data_collected: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            log.info("\n【海关总署】数据清洗入库开始".center(66, "*")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                log.info("数据清洗入库中...") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                perform_data_cleanup_and_import(years_processed) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                log.info("数据清洗入库完毕") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            except Exception as e: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                log.exception(f"数据清洗入库异常: {str(e)}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                send_dingtalk_message(f"【海关总署数据清洗异常】{str(e)}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 7. 生成报告并发送通知 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         duration = time.time() - start_time 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        minutes, seconds = divmod(duration, 60)  # 转换为分钟和秒 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        message = f'【海关总署】{start_year}年-{current_year}年数据已采集完毕,总耗时:{int(minutes)}分{seconds:.1f}秒' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        send_dingtalk_message(message) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        minutes, seconds = divmod(duration, 60) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 准备通知消息 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        year_range = f"{start_year}年-{current_year}年" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        total_years = len(years_processed) if data_collected else 0 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if data_collected: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            # 成功采集通知 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if years_processed: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                processed_years = "、".join(map(str, years_processed)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                message = ( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    f"【海关总署数据采集完成】\n" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    f"• 目标年份: {year_range}\n" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    f"• 实际处理年份: {processed_years}\n" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    f"• 处理年份数量: {len(years_processed)}个\n" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    f"• 总耗时: {int(minutes)}分{seconds:.1f}秒" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                send_dingtalk_message(message) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log.info("【海关总署】数据采集任务结束".center(66, "*")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+if __name__ == "__main__": 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   main() 
			 |