Browse Source

fix anhui

zhangfan 2 months ago
parent
commit
3e7d614b27
1 changed file with 38 additions and 13 deletions
  1. 38 13
      crossborder/anhui/crawl_gov_anhui_full.py

+ 38 - 13
crossborder/anhui/crawl_gov_anhui_full.py

@@ -6,6 +6,8 @@ import time
 from datetime import datetime, timedelta
 from pathlib import Path
 
+from selenium.webdriver.firefox.service import Service
+from webdriver_manager.firefox import GeckoDriverManager
 from faker import Faker
 from selenium import webdriver
 from selenium.webdriver import FirefoxOptions
@@ -20,6 +22,8 @@ from crossborder.utils import base_country_code, base_mysql
 from crossborder.utils.base_country_code import extract_year_month
 from crossborder.utils.dingtalk import send_dingtalk_message
 from crossborder.utils.log import  get_logger
+import urllib.request
+import urllib.error
 
 log = get_logger(__name__)
 
@@ -190,24 +194,43 @@ def detect_latest_month(driver, url):
                 continue
             return f"{check_year}年{check_month}月"
         except:
+        # except Exception as e:
             log.error(f"未找到 {target_title}")
+            # log.error(f"未找到 {target_title} {e}")
             continue
     log.error("三个月内未找到有效数据")
     return None
 
+def check_internet_connection(url="http://www.baidu.com", timeout=5):
+    """Report whether ``url`` can be fetched within ``timeout`` seconds.
+
+    Args:
+        url: Address to probe; defaults to ``http://www.baidu.com``.
+        timeout: Seconds passed to ``urllib.request.urlopen`` as its timeout.
+
+    Returns:
+        True when the request completes, False when it fails with a
+        network-level error.
+    """
+    try:
+        # Close the response explicitly so the probe does not leak a socket.
+        with urllib.request.urlopen(url, timeout=timeout):
+            return True
+    except OSError:
+        # URLError derives from OSError; catching the base class also covers
+        # timeouts and connection resets raised outside urllib's own wrapping.
+        return False
+
 def crawl_with_selenium(url, mark):
-    driver = webdriver.Firefox(options=configure_stealth_options())
+    if not check_internet_connection():
+        log.error("无法连接到互联网,请检查网络设置")
+        raise Exception("网络连接失败")
 
+    driver = None
     year_month = None
-    if 'auto' == mark:
-        res = detect_latest_month(driver, url)
-        if res is None:
-            log.info("安徽省海关没有最新数据更新")
-            return None
-        year_month = res
-        print(f"检测到最新有效数据:{year_month}")
-
     try:
+        # 使用WebDriverManager自动管理geckodriver
+        service = Service(GeckoDriverManager().install())
+        driver = webdriver.Firefox(service=service, options=configure_stealth_options())
+        log.info("Firefox WebDriver初始化成功")
+
+        year_month = None
+        if 'auto' == mark:
+            res = detect_latest_month(driver, url)
+            if res is None:
+                log.info("安徽省海关没有最新数据更新")
+                return None
+            year_month = res
+            print(f"检测到最新有效数据:{year_month}")
+
         # 注入反检测脚本
         driver.execute_script("""
             Object.defineProperty(navigator, 'webdriver', { 
@@ -251,13 +274,14 @@ def crawl_with_selenium(url, mark):
             log.info(f"开始采集 {next_page_url} 页面数据")
 
     finally:
-        driver.quit()
+        if driver:
+            driver.quit()
         print(f"安徽省合肥海关全量数据下载任务完成")
         # 等待5s后执行
-        time.sleep(5)
+        time.sleep(3)
         hierarchical_traversal(download_dir, year_month)
         print("安徽省海关类章、国家、城市所有文件处理完成!")
-        time.sleep(5)
+        time.sleep(3)
         base_mysql.update_shandong_yoy('安徽省')
         print("安徽省合肥海关城市同比sql处理完成")
     return 'finish', year_month
@@ -353,7 +377,8 @@ def main():
                     send_dingtalk_message(f'【安徽省海关】 {r2} 增量数据采集完成,{int(minutes)}分{seconds:.1f}秒')
 
     except Exception as e:
-        send_dingtalk_message(f'【安徽省海关】发生错误:{e}')
+        # send_dingtalk_message(f'【安徽省海关】发生错误:{e}')
+        log.error(f'【安徽省海关】发生错误:{e}')
 
 if __name__ == '__main__':
     main()