| 
					
				 | 
			
			
				@@ -6,6 +6,8 @@ import time 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from datetime import datetime, timedelta 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from pathlib import Path 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from selenium.webdriver.firefox.service import Service 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from webdriver_manager.firefox import GeckoDriverManager 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from faker import Faker 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from selenium import webdriver 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from selenium.webdriver import FirefoxOptions 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -20,6 +22,8 @@ from crossborder.utils import base_country_code, base_mysql 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from crossborder.utils.base_country_code import extract_year_month 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from crossborder.utils.dingtalk import send_dingtalk_message 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from crossborder.utils.log import  get_logger 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import urllib.request 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import urllib.error 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 log = get_logger(__name__) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -190,24 +194,43 @@ def detect_latest_month(driver, url): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 continue 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             return f"{check_year}年{check_month}月" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         except: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # except Exception as e: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             log.error(f"未找到 {target_title}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            # log.error(f"未找到 {target_title} {e}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             continue 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     log.error("三个月内未找到有效数据") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     return None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				def check_internet_connection(url="http://www.baidu.com", timeout=5):
				    """Report whether ``url`` can be fetched within ``timeout`` seconds.
				
				    Args:
				        url: Address to probe; defaults to the Baidu home page.
				        timeout: Seconds handed to ``urllib.request.urlopen``.
				
				    Returns:
				        True when the request completes, False when it fails with a
				        network-level error (including HTTP error statuses and timeouts).
				    """
				    try:
				        # Use the response as a context manager so the probe's socket
				        # is closed instead of being left for the garbage collector.
				        with urllib.request.urlopen(url, timeout=timeout):
				            return True
				    except OSError:
				        # URLError/HTTPError derive from OSError. A timeout while waiting
				        # for the response is raised as a bare socket.timeout/TimeoutError
				        # (also an OSError), which `except URLError` alone would let escape.
				        return False
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 def crawl_with_selenium(url, mark): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    driver = webdriver.Firefox(options=configure_stealth_options()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if not check_internet_connection(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log.error("无法连接到互联网,请检查网络设置") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        raise Exception("网络连接失败") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    driver = None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     year_month = None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    if 'auto' == mark: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        res = detect_latest_month(driver, url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        if res is None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            log.info("安徽省海关没有最新数据更新") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            return None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        year_month = res 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        print(f"检测到最新有效数据:{year_month}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 使用WebDriverManager自动管理geckodriver 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        service = Service(GeckoDriverManager().install()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        driver = webdriver.Firefox(service=service, options=configure_stealth_options()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log.info("Firefox WebDriver初始化成功") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        year_month = None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if 'auto' == mark: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            res = detect_latest_month(driver, url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if res is None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                log.info("安徽省海关没有最新数据更新") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                return None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            year_month = res 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            print(f"检测到最新有效数据:{year_month}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         # 注入反检测脚本 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         driver.execute_script(""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             Object.defineProperty(navigator, 'webdriver', {  
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -251,13 +274,14 @@ def crawl_with_selenium(url, mark): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             log.info(f"开始采集 {next_page_url} 页面数据") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     finally: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        driver.quit() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if driver: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            driver.quit() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         print(f"安徽省合肥海关全量数据下载任务完成") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         # 等待5s后执行 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        time.sleep(5) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        time.sleep(3) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         hierarchical_traversal(download_dir, year_month) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         print("安徽省海关类章、国家、城市所有文件处理完成!") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        time.sleep(5) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        time.sleep(3) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         base_mysql.update_shandong_yoy('安徽省') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         print("安徽省合肥海关城市同比sql处理完成") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     return 'finish', year_month 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -353,7 +377,8 @@ def main(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                     send_dingtalk_message(f'【安徽省海关】 {r2} 增量数据采集完成,{int(minutes)}分{seconds:.1f}秒') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     except Exception as e: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        send_dingtalk_message(f'【安徽省海关】发生错误:{e}') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # send_dingtalk_message(f'【安徽省海关】发生错误:{e}') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log.error(f'【安徽省海关】发生错误:{e}') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 if __name__ == '__main__': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     main() 
			 |