| 
					
				 | 
			
			
				@@ -14,6 +14,7 @@ from selenium.webdriver.support.ui import WebDriverWait 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from crossborder.henan.henan_parse_excel import parse_excel 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from crossborder.utils.constants import DOWNLOAD_DIR 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from crossborder.utils.db_helper import DBHelper 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from crossborder.utils.dingtalk import send_dingtalk_message 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from crossborder.utils.download_utils import configure_stealth_options, get_previous_month, download_excel, generate_month_sequence 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from crossborder.utils.log import  get_logger 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -59,7 +60,7 @@ def detect_latest_month(driver): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                     log.info(f"已找到最新月份数据 {check_year}-{check_month}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                     return check_year, check_month 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            log.info(f"未找到匹配项(正则:{pattern.pattern})") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            log.error(f"未找到匹配项(正则:{pattern.pattern})") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         except TimeoutException: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             log.error(f"页面加载超时或无匹配项({check_year}-{check_month})") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             continue 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -199,22 +200,38 @@ def handle_retry(driver): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 def main(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    """主入口(优化参数处理逻辑)""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    global target_months 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    """主入口(优化河南海关数据采集逻辑)""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     parser = argparse.ArgumentParser(description='海关数据智能抓取系统') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     parser.add_argument('--year', type=int, default=None, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                         help='终止年份(如2023),未指定时抓取最新两个月') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     args = parser.parse_args() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     start_time = time.time() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    driver = webdriver.Firefox(options=configure_stealth_options(download_dir)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    target_months = []  # 初始化目标月份列表 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    data_collected = False  # 数据采集状态标记 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    log.info("【河南海关】数据抓取开始".center(66, "*")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # 仅初始化浏览器一次,避免重复创建 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    driver = None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        # 智能检测最新有效月份 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 1. 初始化浏览器 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        driver = webdriver.Firefox(options=configure_stealth_options(download_dir)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log.info("浏览器初始化完成") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 2. 检测最新有效月份 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         valid_year, valid_month = detect_latest_month(driver) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        log.info(f"检测到最新有效数据:{valid_year}年{valid_month:02d}月") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log.info(f"【河南海关】最新数据:{valid_year}年{valid_month:02d}月") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        # 生成目标序列 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 3. 数据存在性检查(仅在未指定年份时执行) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if not args.year: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            db = DBHelper() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            count = db.get_code_exist(f'{valid_year}-{valid_month:02d}', "410000") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if count > 0: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                log.error(f"数据库已存在【河南省】 {valid_year}-{valid_month:02d} 商品贸易数据,本次抓取终止") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                return 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 4. 生成目标月份序列 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         if args.year: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            # 指定年份时:从最新月到目标年1月 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             target_months = generate_month_sequence( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 start_year=valid_year, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 start_month=valid_month, 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -222,22 +239,44 @@ def main(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 skip_january=True 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            # 未指定年份时:取最近两个月 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            target_months = generate_month_sequence(valid_year, valid_month) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            # 未指定年份时只抓最近两个月份 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            target_months = generate_month_sequence( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                start_year=valid_year, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                start_month=valid_month 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log.info(f"【河南海关】目标采集月份序列:{len(target_months)}个月份") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        log.info(f"目标采集月份序列:{target_months}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 5. 执行数据采集 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         reverse_crawler(driver, target_months) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        log.info(f"{len(target_months)}个月份数据已采集完毕") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        data_collected = True 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log.info(f"【河南海关】成功采集 {len(target_months)} 个月份数据") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 6. 数据清洗入库 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log.info("\n【河南海关】数据清洗入库中...") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        traverse_and_process(download_dir, parse_excel, province_name="henan", year=args.year) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log.info("数据清洗入库完成") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    except Exception as e: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 捕获并记录所有异常 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log.exception(f"【河南海关】采集过程中发生错误: {str(e)}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        send_dingtalk_message(f"【河南海关数据采集异常】{str(e)}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     finally: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        driver.quit() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        log.info("\n数据清洗入库中...") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        traverse_and_process(download_dir, parse_excel, province_name="henan", year=args.year) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        duration = time.time() - start_time 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        minutes, seconds = divmod(duration, 60)  # 转换为分钟和秒 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        message = f'【河南海关】{len(target_months)}个月份数据已采集完毕,总耗时:{int(minutes)}分{seconds:.1f}秒' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        send_dingtalk_message(message) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 确保浏览器退出 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if driver: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            driver.quit() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            log.info("浏览器已退出") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 7. 只有在成功采集数据时才发送通知 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if data_collected: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            duration = time.time() - start_time 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            minutes, seconds = divmod(duration, 60) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            message = (f"【河南海关】{len(target_months)}个月份数据采集完成" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                       f",总耗时:{int(minutes)}分{seconds:.1f}秒") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            send_dingtalk_message(message) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log.info("【河南海关】处理流程结束".center(66, "*")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 if __name__ == "__main__": 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     main() 
			 |