| 
					
				 | 
			
			
				@@ -1,8 +1,9 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import os 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import random 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import re 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-import subprocess 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import sys 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import time 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from datetime import datetime, timedelta 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from pathlib import Path 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from faker import Faker 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -24,7 +25,7 @@ Path(download_dir).mkdir(parents=True, exist_ok=True) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 def configure_stealth_options(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     """增强型反检测配置[1,4](@ref)""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     opts = FirefoxOptions() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    log.info("当前下载路径:", Path(download_dir).resolve()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print("当前下载路径:", Path(download_dir).resolve()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     # 文件下载配置 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     opts.set_preference("browser.download.dir", download_dir) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     opts.set_preference("browser.download.folderList", 2) 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -50,7 +51,7 @@ def configure_stealth_options(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     opts.add_argument("--headless") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     return opts 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-def find_target_links(driver): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def find_target_links(driver, year_month): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     """点击列表页链接进入详情页下载文件""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     WebDriverWait(driver, 30).until( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         EC.presence_of_element_located((By.ID, "conRight")) 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -75,7 +76,7 @@ def find_target_links(driver): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 # 在详情页下载文件 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                download_result = download_file_from_detail_page(driver) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                download_result = download_file_from_detail_page(driver, year_month) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 if download_result == 'stop': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                     return 'stop' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 processed_urls.add(link_url) 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -90,7 +91,7 @@ def find_target_links(driver): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     except Exception as e: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         log.info(f"下载时发生异常: {str(e)}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-def download_file_from_detail_page(driver): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def download_file_from_detail_page(driver, year_month): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     WebDriverWait(driver, 30).until( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         EC.presence_of_element_located((By.ID, "easysiteText")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     ) 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -105,8 +106,13 @@ def download_file_from_detail_page(driver): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             file_name = download_btn.text.strip() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             if not file_name: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 continue 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            if file_name.startswith('2022'): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                return 'stop' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if year_month is None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                if file_name.startswith('2022'): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    return 'stop' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                if not file_name.startswith(year_month): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    log.info(f"非 {year_month} 文件: {file_name}, stop") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    return 'stop' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             if '美元' in file_name or '商品贸易方式' in file_name or '进出口总值' in file_name or '月度表' in file_name: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 log.info(f'{file_name} 不需要此文件,跳过') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 continue 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -162,32 +168,44 @@ def extract_year_and_month(file_name): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         raise ValueError(f"无法从文件名中提取年份和月份:{file_name}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-def extract_rar(rar_path, extract_to): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    """备用解压函数(当 rarfile 失效时使用)""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    winrar_path = r"C:\Program Files\WinRAR\Rar.exe"  # 推荐使用 Rar.exe 而非 WinRAR.exe 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    cmd = [winrar_path, 'x', '-y', rar_path, str(extract_to)] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    # 使用 CREATE_NO_WINDOW 防止弹出命令行窗口 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    creationflags = subprocess.CREATE_NO_WINDOW if os.name == 'nt' else 0 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    result = subprocess.run( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        cmd, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        stdout=subprocess.PIPE, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        stderr=subprocess.PIPE, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        creationflags=creationflags  # 关键点:隐藏窗口 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    if result.returncode == 0: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        log.info(f"解压成功: {rar_path} → {extract_to}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        return True 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        log.info(f"解压失败: {result.stderr.decode('gbk')}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        return False 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-def crawl_with_selenium(url): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def detect_latest_month(driver, url): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    driver.get(url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    current_date = datetime.now() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for offset in range(0, 3): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        check_date = current_date - timedelta(days=offset * 30) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        check_year = check_date.year 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        check_month = check_date.month 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        target_title = f"{check_year}年{check_month}月" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            WebDriverWait(driver, 10).until( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                EC.presence_of_element_located((By.XPATH, f'//a[contains(@title, "{target_title}")]')) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            log.info(f"已找到最新月份数据 {check_year}-{check_month}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            # 看是否已存表,已存则跳过; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            count = base_mysql.get_code_exist(f'{check_year}-{check_month:02d}', "340000") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if count > 0: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                log.info(f"已存在 {check_year}-{check_month} 数据,跳过") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                continue 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            return f"{check_year}年{check_month}月" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        except: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            log.info(f"未找到 {target_title}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            continue 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    log.info("三个月内未找到有效数据") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def crawl_with_selenium(url, mark): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     driver = webdriver.Firefox(options=configure_stealth_options()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    year_month = None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if 'increment' == mark: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        res = detect_latest_month(driver, url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if res is None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            log.info("安徽省海关没有最新数据更新") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            sys.exit(0) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        year_month = res 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print(f"检测到最新有效数据:{year_month}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         # 注入反检测脚本 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         driver.execute_script(""" 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -202,7 +220,7 @@ def crawl_with_selenium(url): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         while True: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             # 访问当前页 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            result = find_target_links(driver) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            result = find_target_links(driver, year_month) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             if result == 'stop': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 break 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -294,7 +312,8 @@ def hierarchical_traversal(root_path): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 gov_commodity_anhui_city.process_folder(md['path']) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 if __name__ == "__main__": 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    crawl_with_selenium('http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    crawl_with_selenium('http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html', 'all') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # crawl_with_selenium('http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html', 'increment') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     print(f"安徽合肥海关全量数据下载任务完成") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     # 等待5s后执行 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     time.sleep(5) 
			 |