Browse Source

4省参数设置提取为公共方法

zhangfan 2 months ago
parent
commit
9db001786a

+ 5 - 37
crossborder/anhui/crawl_gov_anhui_full.py

@@ -3,14 +3,12 @@ import os
 import random
 import re
 import time
+import urllib.error
+import urllib.request
 from datetime import datetime, timedelta
 from pathlib import Path
 
-from selenium.webdriver.firefox.service import Service
-from webdriver_manager.firefox import GeckoDriverManager
-from faker import Faker
 from selenium import webdriver
-from selenium.webdriver import FirefoxOptions
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
@@ -21,41 +19,11 @@ from crossborder.anhui import gov_commodity_anhui_import_export
 from crossborder.utils import base_country_code, base_mysql
 from crossborder.utils.base_country_code import extract_year_month
 from crossborder.utils.dingtalk import send_dingtalk_message
-from crossborder.utils.log import  get_logger
-import urllib.request
-import urllib.error
+from crossborder.utils.download_utils import configure_stealth_options
+from crossborder.utils.log import get_logger
 
 log = get_logger(__name__)
 
-def configure_stealth_options():
-    """增强型反检测配置[1,4](@ref)"""
-    opts = FirefoxOptions()
-    print("当前下载路径:", Path(download_dir).resolve())
-    # 文件下载配置
-    opts.set_preference("browser.download.dir", download_dir)
-    opts.set_preference("browser.download.folderList", 2)
-    opts.set_preference("browser.download.manager.showWhenStarting", False)
-    opts.set_preference("browser.helperApps.neverAsk.saveToDisk",
-                        "application/octet-stream, application/vnd.ms-excel")  # 覆盖常见文件类型
-    opts.set_preference("browser.download.manager.useWindow", False)  # 禁用下载管理器窗口
-    opts.set_preference("browser.download.manager.showAlertOnComplete", False)  # 关闭完成提示
-
-    # 反检测参数
-    opts.set_preference("dom.webdriver.enabled", False)
-    opts.set_preference("useAutomationExtension", False)
-    opts.add_argument("--disable-blink-features=AutomationControlled")
-
-    # 动态指纹
-    fake = Faker()
-    opts.set_preference("general.useragent.override", fake.firefox())
-    opts.set_preference("intl.accept_languages", "zh-CN,zh;q=0.9")
-
-    # 视口配置
-    opts.add_argument("--width=1440")
-    opts.add_argument("--height=900")
-    opts.add_argument("--headless")
-    return opts
-
 def find_target_links(driver, year_month):
     """点击列表页链接进入详情页下载文件"""
     WebDriverWait(driver, 30).until(
@@ -220,7 +188,7 @@ def crawl_with_selenium(url, mark):
         # 使用WebDriverManager自动管理geckodriver
         # service = Service(GeckoDriverManager().install())
         # driver = webdriver.Firefox(service=service, options=configure_stealth_options())
-        driver = webdriver.Firefox(options=configure_stealth_options())
+        driver = webdriver.Firefox(options=configure_stealth_options(download_dir))
         log.info("Firefox WebDriver初始化成功")
 
         year_month = None

+ 3 - 34
crossborder/hebei/crawl_gov_hebei_full.py

@@ -6,9 +6,7 @@ import time
 from datetime import datetime, timedelta
 from pathlib import Path
 
-from faker import Faker
 from selenium import webdriver
-from selenium.webdriver import FirefoxOptions
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
@@ -20,7 +18,8 @@ from crossborder.hebei import gov_commodity_hebei_import_export
 from crossborder.utils import base_country_code, base_mysql
 from crossborder.utils.base_country_code import extract_year_month
 from crossborder.utils.dingtalk import send_dingtalk_message
-from crossborder.utils.log import  get_logger
+from crossborder.utils.download_utils import configure_stealth_options
+from crossborder.utils.log import get_logger
 
 log = get_logger(__name__)
 
@@ -33,36 +32,6 @@ def get_current_target_titles():
         f"2025年4月河北分地市"
     ]
 
-def configure_stealth_options():
-    """增强型反检测配置[1,4](@ref)"""
-    opts = FirefoxOptions()
-    print("当前下载路径:", Path(download_dir).resolve())
-    # 文件下载配置
-    opts.set_preference("browser.download.dir", download_dir)
-    opts.set_preference("browser.download.folderList", 2)
-    opts.set_preference("browser.download.manager.showWhenStarting", False)
-    opts.set_preference("browser.helperApps.neverAsk.saveToDisk",
-                        "application/octet-stream, application/vnd.ms-excel")  # 覆盖常见文件类型
-    opts.set_preference("browser.download.manager.useWindow", False)  # 禁用下载管理器窗口
-    opts.set_preference("browser.download.manager.showAlertOnComplete", False)  # 关闭完成提示
-
-    # 反检测参数
-    opts.set_preference("dom.webdriver.enabled", False)
-    opts.set_preference("useAutomationExtension", False)
-    opts.add_argument("--disable-blink-features=AutomationControlled")
-
-    # 动态指纹
-    fake = Faker()
-    opts.set_preference("general.useragent.override", fake.firefox())
-    opts.set_preference("intl.accept_languages", "zh-CN,zh;q=0.9")
-
-    # 视口配置
-    opts.add_argument("--width=1440")
-    opts.add_argument("--height=900")
-    opts.add_argument("--headless")
-    return opts
-
-
 def remove_prefix_from_url(url):
     # 分离路径和文件名
     path_parts = url.split('/')
@@ -193,7 +162,7 @@ def detect_latest_month(driver, url):
     return None
 
 def crawl_with_selenium(url, mark):
-    driver = webdriver.Firefox(options=configure_stealth_options())
+    driver = webdriver.Firefox(options=configure_stealth_options(download_dir))
 
     year_month = None
     if 'auto' == mark:

+ 2 - 30
crossborder/jiangsu/crawl_gov_jiangsu_full.py

@@ -26,6 +26,7 @@ from crossborder.utils import base_country_code, base_mysql
 from crossborder.utils.base_country_code import extract_year_month
 from crossborder.utils.dingtalk import send_dingtalk_message
 from crossborder.utils.log import  get_logger
+from crossborder.utils.download_utils import configure_stealth_options
 
 log = get_logger(__name__)
 
@@ -34,35 +35,6 @@ if sys.platform.startswith('linux'):
 else:
     rarfile.UNRAR_EXECUTABLE = r"C:\Program Files\WinRAR\UnRAR.exe"
 
-def configure_stealth_options():
-    """增强型反检测配置[1,4](@ref)"""
-    opts = FirefoxOptions()
-    print("当前下载路径:", Path(download_dir).resolve())
-    # 文件下载配置
-    opts.set_preference("browser.download.dir", download_dir)
-    opts.set_preference("browser.download.folderList", 2)
-    opts.set_preference("browser.download.manager.showWhenStarting", False)
-    opts.set_preference("browser.helperApps.neverAsk.saveToDisk",
-                        "application/octet-stream, application/vnd.ms-excel")  # 覆盖常见文件类型
-    opts.set_preference("browser.download.manager.useWindow", False)  # 禁用下载管理器窗口
-    opts.set_preference("browser.download.manager.showAlertOnComplete", False)  # 关闭完成提示
-
-    # 反检测参数
-    opts.set_preference("dom.webdriver.enabled", False)
-    opts.set_preference("useAutomationExtension", False)
-    opts.add_argument("--disable-blink-features=AutomationControlled")
-
-    # 动态指纹
-    fake = Faker()
-    opts.set_preference("general.useragent.override", fake.firefox())
-    opts.set_preference("intl.accept_languages", "zh-CN,zh;q=0.9")
-
-    # 视口配置
-    opts.add_argument("--width=1440")
-    opts.add_argument("--height=900")
-    opts.add_argument("--headless")
-    return opts
-
 def find_target_links(driver, year_month):
     """在当前页面找到符合 TARGET_TITLES 的文件并触发下载"""
     # 等待页面加载完成
@@ -219,7 +191,7 @@ def detect_latest_month(driver, url):
     return None
 
 def crawl_with_selenium(url, mark):
-    driver = webdriver.Firefox(options=configure_stealth_options())
+    driver = webdriver.Firefox(options=configure_stealth_options(download_dir))
 
     year_month = None
     if 'auto' == mark:

+ 4 - 36
crossborder/zhejiang/crawl_gov_zhejiang_full.py

@@ -3,59 +3,27 @@ import os
 import random
 import re
 import time
-import sys
 from datetime import datetime, timedelta
 from pathlib import Path
 from urllib.parse import urljoin
 
-from faker import Faker
 from selenium import webdriver
 from selenium.common.exceptions import StaleElementReferenceException
-from selenium.webdriver import FirefoxOptions
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
-from crossborder.utils.base_country_code import extract_year_month
+from crossborder.utils import base_country_code, base_mysql
 from crossborder.utils.dingtalk import send_dingtalk_message
+from crossborder.utils.download_utils import configure_stealth_options
+from crossborder.utils.log import get_logger
 from crossborder.zhejiang import download_dir
 from crossborder.zhejiang import gov_commodity_zhejiang_city
 from crossborder.zhejiang import gov_commodity_zhejiang_country
 from crossborder.zhejiang import gov_commodity_zhejiang_import_export
-from crossborder.utils import base_country_code, base_mysql
-from crossborder.utils.log import  get_logger
 
 log = get_logger(__name__)
 
-def configure_stealth_options():
-    """增强型反检测配置[1,4](@ref)"""
-    opts = FirefoxOptions()
-    print("当前下载路径:", Path(download_dir).resolve())
-    # 文件下载配置
-    opts.set_preference("browser.download.dir", download_dir)
-    opts.set_preference("browser.download.folderList", 2)
-    opts.set_preference("browser.download.manager.showWhenStarting", False)
-    opts.set_preference("browser.helperApps.neverAsk.saveToDisk",
-                        "application/octet-stream, application/vnd.ms-excel")  # 覆盖常见文件类型
-    opts.set_preference("browser.download.manager.useWindow", False)  # 禁用下载管理器窗口
-    opts.set_preference("browser.download.manager.showAlertOnComplete", False)  # 关闭完成提示
-
-    # 反检测参数
-    opts.set_preference("dom.webdriver.enabled", False)
-    opts.set_preference("useAutomationExtension", False)
-    opts.add_argument("--disable-blink-features=AutomationControlled")
-
-    # 动态指纹
-    fake = Faker()
-    opts.set_preference("general.useragent.override", fake.firefox())
-    opts.set_preference("intl.accept_languages", "zh-CN,zh;q=0.9")
-
-    # 视口配置
-    opts.add_argument("--width=1440")
-    opts.add_argument("--height=900")
-    opts.add_argument("--headless")
-    return opts
-
 def crawl_by_year_tabs(driver, base_url, year_month):
     """按年份Tab导航采集数据"""
     years = ['2023年', '2024年', '2025年']
@@ -387,7 +355,7 @@ def extract_year_month_chinese(text):
     return year, month
 
 def crawl_with_selenium(url, mark):
-    driver = webdriver.Firefox(options=configure_stealth_options())
+    driver = webdriver.Firefox(options=configure_stealth_options(download_dir))
 
     year_month = None
     if 'auto' == mark: