| 
														
															@@ -1,26 +1,19 @@ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 import argparse 
														 | 
														
														 | 
														
															 import argparse 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-import os 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 import random 
														 | 
														
														 | 
														
															 import random 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 import re 
														 | 
														
														 | 
														
															 import re 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 import time 
														 | 
														
														 | 
														
															 import time 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 from datetime import datetime 
														 | 
														
														 | 
														
															 from datetime import datetime 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-from pathlib import Path 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															-from faker import Faker 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 from selenium import webdriver 
														 | 
														
														 | 
														
															 from selenium import webdriver 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 from selenium.common import StaleElementReferenceException 
														 | 
														
														 | 
														
															 from selenium.common import StaleElementReferenceException 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-from selenium.webdriver import FirefoxOptions, ActionChains 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+from selenium.webdriver import ActionChains 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+from selenium.webdriver.common.by import By 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 from selenium.webdriver.support import expected_conditions as EC 
														 | 
														
														 | 
														
															 from selenium.webdriver.support import expected_conditions as EC 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 from selenium.webdriver.support.ui import WebDriverWait 
														 | 
														
														 | 
														
															 from selenium.webdriver.support.ui import WebDriverWait 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 from crossborder.quanguo.data_cleaning_to_db import perform_data_cleanup_and_import 
														 | 
														
														 | 
														
															 from crossborder.quanguo.data_cleaning_to_db import perform_data_cleanup_and_import 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-from crossborder.quanguo.parse_country_table_excel import parse_country_table_excel 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-from crossborder.quanguo.parse_month_excel import parse_month_table_excel 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-from crossborder.quanguo.parse_year_excel import parse_year_table_excel 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 from crossborder.utils.constants import DOWNLOAD_DIR 
														 | 
														
														 | 
														
															 from crossborder.utils.constants import DOWNLOAD_DIR 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-from crossborder.utils.download_utils import configure_stealth_options, wait_for_download, download_excel 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-from selenium.webdriver.common.by import By 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															- 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+from crossborder.utils.download_utils import configure_stealth_options, download_excel 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 from crossborder.utils.log import log 
														 | 
														
														 | 
														
															 from crossborder.utils.log import log 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 base_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html" 
														 | 
														
														 | 
														
															 base_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html" 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -40,36 +33,62 @@ def generate_table_title(year): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         f"(16){year}年自部分国家(地区)进口商品类章金额表" 
														 | 
														
														 | 
														
															         f"(16){year}年自部分国家(地区)进口商品类章金额表" 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     ] 
														 | 
														
														 | 
														
															     ] 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 def process_table_row(row): 
														 | 
														
														 | 
														
															 def process_table_row(row): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    """动态处理表格行数据(Selenium语法)""" 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    """更健壮的表格行处理函数""" 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     try: 
														 | 
														
														 | 
														
															     try: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        # 获取所有表格单元格(td)元素 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        cells = row.find_elements(By.TAG_NAME, 'td') 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        # 使用相对定位获取单元格 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        cells = WebDriverWait(row, 15).until( 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            EC.presence_of_all_elements_located((By.XPATH, "./td")) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        ) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         if len(cells) < 2: 
														 | 
														
														 | 
														
															         if len(cells) < 2: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															             return None 
														 | 
														
														 | 
														
															             return None 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        # 获取表格名 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        table_name = cells[0].text.strip() 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        # 使用文本内容稳定性检查 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        table_name = None 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        for attempt in range(3): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            try: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                table_name = cells[0].text.strip() 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                if table_name:  # 确认文本稳定获取 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                    break 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            except StaleElementReferenceException: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                # 重新获取单元格 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                cells = row.find_elements(By.XPATH, "./td") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                if len(cells) < 2: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                    return None 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                time.sleep(0.5) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        if not table_name: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            return None 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        # 获取第二列中的所有链接,提取月份和href 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        # 月份链接处理(使用更稳定的XPATH) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         month_links = [] 
														 | 
														
														 | 
														
															         month_links = [] 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        links = cells[1].find_elements(By.TAG_NAME, 'a') 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        for a in links: 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-            # 获取文本并去掉‘月’ 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-            month_text = a.text 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-            if '月' in month_text: 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-                month = int(month_text.replace('月', '').strip()) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-                href = a.get_attribute('href') 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-                if href: 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-                    month_links.append((month, href)) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															- 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        # 按月份倒序排列(1-12月) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        month_links.sort(key=lambda x: x[0], reverse=True) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        link_elements = cells[1].find_elements(By.XPATH, ".//a") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        for a in link_elements: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            try: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                # 添加临时等待避免元素状态变化 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                time.sleep(0.2) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                month_text = a.text.strip() 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                if '月' in month_text: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                    month = int(month_text.replace('月', '').strip()) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                    href = a.get_attribute('href') 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                    if href: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                        month_links.append((month, href)) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            except StaleElementReferenceException: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                continue  # 跳过已无效的链接 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            except Exception as e: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                log.debug(f"处理链接异常: {str(e)}") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        # 如果获取到链接再排序 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        if month_links: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            month_links.sort(key=lambda x: x[0], reverse=True) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															         return (table_name, month_links) 
														 | 
														
														 | 
														
															         return (table_name, month_links) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															     except Exception as e: 
														 | 
														
														 | 
														
															     except Exception as e: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        log.info(f"表格行处理异常: {str(e)}") 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        log.info(f"表格行处理异常: {str(e)}", exc_info=True) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         return None 
														 | 
														
														 | 
														
															         return None 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -181,3 +200,4 @@ if __name__ == "__main__": 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         log.info("【海关总署】全年数据抓取结束".center(66, "*")) 
														 | 
														
														 | 
														
															         log.info("【海关总署】全年数据抓取结束".center(66, "*")) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         log.info("\n数据清洗入库中...") 
														 | 
														
														 | 
														
															         log.info("\n数据清洗入库中...") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         perform_data_cleanup_and_import(current_year) 
														 | 
														
														 | 
														
															         perform_data_cleanup_and_import(current_year) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        log.info("\n数据清洗入库完毕...") 
														 |