|
|
@@ -3,59 +3,27 @@ import os
|
|
|
import random
|
|
|
import re
|
|
|
import time
|
|
|
-import sys
|
|
|
from datetime import datetime, timedelta
|
|
|
from pathlib import Path
|
|
|
from urllib.parse import urljoin
|
|
|
|
|
|
-from faker import Faker
|
|
|
from selenium import webdriver
|
|
|
from selenium.common.exceptions import StaleElementReferenceException
|
|
|
-from selenium.webdriver import FirefoxOptions
|
|
|
from selenium.webdriver.common.by import By
|
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
|
|
|
|
-from crossborder.utils.base_country_code import extract_year_month
|
|
|
+from crossborder.utils import base_country_code, base_mysql
|
|
|
from crossborder.utils.dingtalk import send_dingtalk_message
|
|
|
+from crossborder.utils.download_utils import configure_stealth_options
|
|
|
+from crossborder.utils.log import get_logger
|
|
|
from crossborder.zhejiang import download_dir
|
|
|
from crossborder.zhejiang import gov_commodity_zhejiang_city
|
|
|
from crossborder.zhejiang import gov_commodity_zhejiang_country
|
|
|
from crossborder.zhejiang import gov_commodity_zhejiang_import_export
|
|
|
-from crossborder.utils import base_country_code, base_mysql
|
|
|
-from crossborder.utils.log import get_logger
|
|
|
|
|
|
log = get_logger(__name__)
|
|
|
|
|
|
-def configure_stealth_options():
|
|
|
- """增强型反检测配置[1,4](@ref)"""
|
|
|
- opts = FirefoxOptions()
|
|
|
- print("当前下载路径:", Path(download_dir).resolve())
|
|
|
- # 文件下载配置
|
|
|
- opts.set_preference("browser.download.dir", download_dir)
|
|
|
- opts.set_preference("browser.download.folderList", 2)
|
|
|
- opts.set_preference("browser.download.manager.showWhenStarting", False)
|
|
|
- opts.set_preference("browser.helperApps.neverAsk.saveToDisk",
|
|
|
- "application/octet-stream, application/vnd.ms-excel") # 覆盖常见文件类型
|
|
|
- opts.set_preference("browser.download.manager.useWindow", False) # 禁用下载管理器窗口
|
|
|
- opts.set_preference("browser.download.manager.showAlertOnComplete", False) # 关闭完成提示
|
|
|
-
|
|
|
- # 反检测参数
|
|
|
- opts.set_preference("dom.webdriver.enabled", False)
|
|
|
- opts.set_preference("useAutomationExtension", False)
|
|
|
- opts.add_argument("--disable-blink-features=AutomationControlled")
|
|
|
-
|
|
|
- # 动态指纹
|
|
|
- fake = Faker()
|
|
|
- opts.set_preference("general.useragent.override", fake.firefox())
|
|
|
- opts.set_preference("intl.accept_languages", "zh-CN,zh;q=0.9")
|
|
|
-
|
|
|
- # 视口配置
|
|
|
- opts.add_argument("--width=1440")
|
|
|
- opts.add_argument("--height=900")
|
|
|
- opts.add_argument("--headless")
|
|
|
- return opts
|
|
|
-
|
|
|
def crawl_by_year_tabs(driver, base_url, year_month):
|
|
|
"""按年份Tab导航采集数据"""
|
|
|
years = ['2023年', '2024年', '2025年']
|
|
|
@@ -387,7 +355,7 @@ def extract_year_month_chinese(text):
|
|
|
return year, month
|
|
|
|
|
|
def crawl_with_selenium(url, mark):
|
|
|
- driver = webdriver.Firefox(options=configure_stealth_options())
|
|
|
+ driver = webdriver.Firefox(options=configure_stealth_options(download_dir))
|
|
|
|
|
|
year_month = None
|
|
|
if 'auto' == mark:
|