| 
					
				 | 
			
			
				@@ -2,6 +2,8 @@ import os 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import random 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import re 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import time 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import sys 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from datetime import datetime, timedelta 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from pathlib import Path 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from urllib.parse import urljoin 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -51,7 +53,7 @@ def configure_stealth_options(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     opts.add_argument("--headless") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     return opts 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-def crawl_by_year_tabs(driver, base_url): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def crawl_by_year_tabs(driver, base_url, year_month): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     """按年份Tab导航爬取数据""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     years = ['2023年', '2024年', '2025年'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     WebDriverWait(driver, 30).until( 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -74,15 +76,15 @@ def crawl_by_year_tabs(driver, base_url): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         driver.switch_to.window(driver.window_handles[-1]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         log.info(f"\n正在处理 {year_text} 年份页面") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        process_month_tabs(driver, year_text, base_url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        process_month_tabs(driver, year_text, base_url, year_month) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         # 返回主窗口 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         driver.close() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         driver.switch_to.window(driver.window_handles[0]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-def process_month_tabs(driver, year, base_url): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def process_month_tabs(driver, year, base_url, year_month): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     """处理月份Tab导航(动态获取真实存在的月份)""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    # ✅ 显式等待容器加载 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # 显式等待容器加载 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     WebDriverWait(driver, 30).until( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         EC.presence_of_element_located((By.CLASS_NAME, "portlet")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     ) 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -92,8 +94,7 @@ def process_month_tabs(driver, year, base_url): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     processed_months = set()  # 已处理月份记录 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     retry_count = 0 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    # while retry_count < 3:  # 最多重试3次 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    while True:  # 最多重试3次 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    while retry_count < 3: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             # 全量获取所有月份Tab 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             month_items = driver.find_elements(By.XPATH, '//ul[@class="nav_tab"]//li') 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -115,6 +116,14 @@ def process_month_tabs(driver, year, base_url): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                     continue  # 跳过已处理月份 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 log.info(f"点击月份Tab:{year}-{month_text}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                if year_month is not None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    tar_year, tar_month = year_month.split('-')[0], year_month.split('-')[1] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    if tar_year != year: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                        retry_count += 1 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                        break 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    if tar_month != month_text: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                        log.info(f"{year}年 {month_text} 月份跳过, increment tar: {year_month}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                        continue 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 a_tag.click() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 # 处理详情页逻辑 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -145,13 +154,10 @@ def process_month_tabs(driver, year, base_url): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 break 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 # 部分月份未找到,重新获取元素 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                # retry_count += 1 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 log.info(f"第 {retry_count} 次重试获取月份Tab...") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                time.sleep(2) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         except StaleElementReferenceException: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             log.info("页面刷新,重新获取月份Tab列表...") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            # retry_count += 1 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             time.sleep(2) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     log.info(f"{year}年最终处理的月份:{processed_months}") 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -234,10 +240,60 @@ def extract_year_and_month(file_name): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         raise ValueError(f"无法从文件名中提取年份和月份:{file_name}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-def crawl_with_selenium(url): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def convert_to_chinese_uppercase(num): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if not 1 <= num <= 12: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        return None  # 超出范围的数字返回 None 或根据需要处理 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if num < 10: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        return '零一二三四五六七八九'[num] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    elif num == 10: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        return '十' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    elif num == 11: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        return '十一' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    elif num == 12: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        return '十二' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def detect_latest_month(driver, url): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    driver.get(url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    current_date = datetime.now() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for offset in range(0, 3): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        check_date = current_date - timedelta(days=offset * 30) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        check_year = check_date.year 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        month = check_date.month 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        check_month = convert_to_chinese_uppercase(month) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        target_title = f"{check_month}月" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            WebDriverWait(driver, 10).until( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                EC.presence_of_element_located((By.XPATH, f'//ul[@class="nav_tab"]//li/a[normalize-space()="{target_title}"]')) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            log.info(f"已找到最新月份数据 {check_year}-{check_month}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            # 看是否已存表,已存则跳过; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            count = base_mysql.get_code_exist(f'{check_year}-{month:02d}', '330000') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if count > 0: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                log.info(f"count: {count} -> 已存在 {check_year}-{check_month} 数据,跳过") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                continue 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            return f"{check_year}年-{check_month}月" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        except: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            log.info(f"未找到 {target_title}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            continue 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    log.info("三个月内未找到有效数据") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def crawl_with_selenium(url, mark): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     driver = webdriver.Firefox(options=configure_stealth_options()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    base_url = 'http://hangzhou.customs.gov.cn' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    year_month = None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if 'increment' == mark: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        res = detect_latest_month(driver, url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if res is None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            log.info("浙江省海关没有最新数据更新") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            sys.exit(0) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        year_month = res 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print(f"检测到最新有效数据:{year_month}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    base_url = 'http://hangzhou.customs.gov.cn' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         # 注入反检测脚本 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         driver.execute_script(""" 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -251,7 +307,7 @@ def crawl_with_selenium(url): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         driver.get(url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         # 按年份导航 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        crawl_by_year_tabs(driver, base_url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        crawl_by_year_tabs(driver, base_url, year_month) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     finally: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         driver.quit() 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -316,7 +372,8 @@ def hierarchical_traversal(root_path): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 gov_commodity_zhejiang_city.process_folder(md['path']) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 if __name__ == "__main__": 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html', 'all') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html', 'increment') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     log.info(f"浙江杭州海关全量数据下载任务完成") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     # 等待5s后执行 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     time.sleep(5) 
			 |