| 
					
				 | 
			
			
				@@ -24,7 +24,7 @@ log = get_logger(__name__) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 base_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 download_dir = DOWNLOAD_DIR / "total" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 downloaded_tables = set()  # 已下载的表格名集合 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+data_collected = False  # 是否有数据被采集 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 def generate_table_title(year): 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -118,7 +118,8 @@ def go_to_year_page(driver, year): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         return False 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-def crawl_with_selenium(driver, year, latest_only=False,data_collected=False): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def crawl_with_selenium(driver, year, latest_only=False): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    global data_collected 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     """主抓取函数""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     if year < datetime.now().year: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         if not go_to_year_page(driver, year): 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -195,7 +196,7 @@ def handle_month_data(driver, table_name, month_links, year): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                # 下载成功后将表格名加入集合 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                downloaded_tables.add(table_name) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             except Exception as e: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-               log.info(f"【异常】下载失败: {str(e)}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+               log.error(f"【异常】{year}-{month_num:02d} {table_name}: {str(e)}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             time.sleep(random.uniform(0.5, 1.5))  # 下载间隔 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 def  main(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -206,7 +207,6 @@ def  main(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     start_time = time.time() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     years_processed = []  # 记录成功处理的年份 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    data_collected = False  # 是否有数据被采集 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     driver =  None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     try: 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -236,12 +236,12 @@ def  main(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         # 4. 年份遍历采集 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         for year in years_to_crawl: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                log.info(f"\n【{year}年】开始处理".center(66, "-")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                log.info(f"【{year}年】开始处理".center(66, "-")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 is_latest_only = (not args.year) and (year == current_year) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 # 执行年份采集 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                crawl_with_selenium(driver, year=year, latest_only=is_latest_only,data_collected =  data_collected) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                crawl_with_selenium(driver, year=year, latest_only=is_latest_only) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 years_processed.append(year) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 log.info(f"【{year}年】处理完成".center(66, "-")) 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -263,11 +263,11 @@ def  main(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         # 6. 数据清洗入库(仅当有数据被采集时) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         if data_collected: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            log.info("\n【海关总署】数据清洗入库开始".center(66, "*")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            log.info("【海关总署】数据清洗入库开始".center(66, "*")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 log.info("数据清洗入库中...") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                perform_data_cleanup_and_import(years_processed) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                perform_data_cleanup_and_import(current_year) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 log.info("数据清洗入库完毕") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             except Exception as e: 
			 |