| 
					
				 | 
			
			
				@@ -1,26 +1,19 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import argparse 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-import os 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import random 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import re 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import time 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from datetime import datetime 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-from pathlib import Path 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-from faker import Faker 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from selenium import webdriver 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from selenium.common import StaleElementReferenceException 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-from selenium.webdriver import FirefoxOptions, ActionChains 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from selenium.webdriver import ActionChains 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from selenium.webdriver.common.by import By 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from selenium.webdriver.support import expected_conditions as EC 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from selenium.webdriver.support.ui import WebDriverWait 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from crossborder.quanguo.data_cleaning_to_db import perform_data_cleanup_and_import 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-from crossborder.quanguo.parse_country_table_excel import parse_country_table_excel 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-from crossborder.quanguo.parse_month_excel import parse_month_table_excel 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-from crossborder.quanguo.parse_year_excel import parse_year_table_excel 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from crossborder.utils.constants import DOWNLOAD_DIR 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-from crossborder.utils.download_utils import configure_stealth_options, wait_for_download, download_excel 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-from selenium.webdriver.common.by import By 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from crossborder.utils.download_utils import configure_stealth_options, download_excel 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from crossborder.utils.log import log 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 base_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html" 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -40,36 +33,62 @@ def generate_table_title(year): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         f"(16){year}年自部分国家(地区)进口商品类章金额表" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 def process_table_row(row): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    """动态处理表格行数据(Selenium语法)""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    """更健壮的表格行处理函数""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        # 获取所有表格单元格(td)元素 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        cells = row.find_elements(By.TAG_NAME, 'td') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 使用相对定位获取单元格 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        cells = WebDriverWait(row, 15).until( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            EC.presence_of_all_elements_located((By.XPATH, "./td")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         if len(cells) < 2: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             return None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        # 获取表格名 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        table_name = cells[0].text.strip() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 使用文本内容稳定性检查 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        table_name = None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        for attempt in range(3): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                table_name = cells[0].text.strip() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                if table_name:  # 确认文本稳定获取 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    break 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            except StaleElementReferenceException: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                # 重新获取单元格 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                cells = row.find_elements(By.XPATH, "./td") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                if len(cells) < 2: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    return None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                time.sleep(0.5) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if not table_name: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            return None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        # 获取第二列中的所有链接,提取月份和href 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 月份链接处理(使用更稳定的XPATH) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         month_links = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        links = cells[1].find_elements(By.TAG_NAME, 'a') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        for a in links: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            # 获取文本并去掉‘月’ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            month_text = a.text 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            if '月' in month_text: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                month = int(month_text.replace('月', '').strip()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                href = a.get_attribute('href') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                if href: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                    month_links.append((month, href)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        # 按月份倒序排列(1-12月) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        month_links.sort(key=lambda x: x[0], reverse=True) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        link_elements = cells[1].find_elements(By.XPATH, ".//a") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        for a in link_elements: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                # 添加临时等待避免元素状态变化 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                time.sleep(0.2) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                month_text = a.text.strip() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                if '月' in month_text: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    month = int(month_text.replace('月', '').strip()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    href = a.get_attribute('href') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    if href: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                        month_links.append((month, href)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            except StaleElementReferenceException: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                continue  # 跳过已无效的链接 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            except Exception as e: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                log.debug(f"处理链接异常: {str(e)}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 如果获取到链接再排序 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if month_links: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            month_links.sort(key=lambda x: x[0], reverse=True) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         return (table_name, month_links) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     except Exception as e: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        log.info(f"表格行处理异常: {str(e)}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log.info(f"表格行处理异常: {str(e)}", exc_info=True) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         return None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -181,3 +200,4 @@ if __name__ == "__main__": 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         log.info("【海关总署】全年数据抓取结束".center(66, "*")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         log.info("\n数据清洗入库中...") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         perform_data_cleanup_and_import(current_year) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log.info("\n数据清洗入库完毕...") 
			 |