|
@@ -1,8 +1,9 @@
|
|
|
import os
|
|
|
import random
|
|
|
import re
|
|
|
-import subprocess
|
|
|
+import sys
|
|
|
import time
|
|
|
+from datetime import datetime, timedelta
|
|
|
from pathlib import Path
|
|
|
|
|
|
from faker import Faker
|
|
@@ -24,7 +25,7 @@ Path(download_dir).mkdir(parents=True, exist_ok=True)
|
|
|
def configure_stealth_options():
|
|
|
"""增强型反检测配置[1,4](@ref)"""
|
|
|
opts = FirefoxOptions()
|
|
|
- log.info("当前下载路径:", Path(download_dir).resolve())
|
|
|
+ print("当前下载路径:", Path(download_dir).resolve())
|
|
|
# 文件下载配置
|
|
|
opts.set_preference("browser.download.dir", download_dir)
|
|
|
opts.set_preference("browser.download.folderList", 2)
|
|
@@ -50,7 +51,7 @@ def configure_stealth_options():
|
|
|
opts.add_argument("--headless")
|
|
|
return opts
|
|
|
|
|
|
-def find_target_links(driver):
|
|
|
+def find_target_links(driver, year_month):
|
|
|
"""点击列表页链接进入详情页下载文件"""
|
|
|
WebDriverWait(driver, 30).until(
|
|
|
EC.presence_of_element_located((By.ID, "conRight"))
|
|
@@ -75,7 +76,7 @@ def find_target_links(driver):
|
|
|
|
|
|
try:
|
|
|
# 在详情页下载文件
|
|
|
- download_result = download_file_from_detail_page(driver)
|
|
|
+ download_result = download_file_from_detail_page(driver, year_month)
|
|
|
if download_result == 'stop':
|
|
|
return 'stop'
|
|
|
processed_urls.add(link_url)
|
|
@@ -90,7 +91,7 @@ def find_target_links(driver):
|
|
|
except Exception as e:
|
|
|
log.info(f"下载时发生异常: {str(e)}")
|
|
|
|
|
|
-def download_file_from_detail_page(driver):
|
|
|
+def download_file_from_detail_page(driver, year_month):
|
|
|
WebDriverWait(driver, 30).until(
|
|
|
EC.presence_of_element_located((By.ID, "easysiteText"))
|
|
|
)
|
|
@@ -105,8 +106,13 @@ def download_file_from_detail_page(driver):
|
|
|
file_name = download_btn.text.strip()
|
|
|
if not file_name:
|
|
|
continue
|
|
|
- if file_name.startswith('2022'):
|
|
|
- return 'stop'
|
|
|
+ if year_month is None:
|
|
|
+ if file_name.startswith('2022'):
|
|
|
+ return 'stop'
|
|
|
+ else:
|
|
|
+ if not file_name.startswith(year_month):
|
|
|
+ log.info(f"非 {year_month} 文件: {file_name}, stop")
|
|
|
+ return 'stop'
|
|
|
if '美元' in file_name or '商品贸易方式' in file_name or '进出口总值' in file_name or '月度表' in file_name:
|
|
|
log.info(f'{file_name} 不需要此文件,跳过')
|
|
|
continue
|
|
@@ -162,32 +168,44 @@ def extract_year_and_month(file_name):
|
|
|
else:
|
|
|
raise ValueError(f"无法从文件名中提取年份和月份:{file_name}")
|
|
|
|
|
|
-def extract_rar(rar_path, extract_to):
|
|
|
- """备用解压函数(当 rarfile 失效时使用)"""
|
|
|
- winrar_path = r"C:\Program Files\WinRAR\Rar.exe" # 推荐使用 Rar.exe 而非 WinRAR.exe
|
|
|
- cmd = [winrar_path, 'x', '-y', rar_path, str(extract_to)]
|
|
|
-
|
|
|
- # 使用 CREATE_NO_WINDOW 防止弹出命令行窗口
|
|
|
- creationflags = subprocess.CREATE_NO_WINDOW if os.name == 'nt' else 0
|
|
|
-
|
|
|
- result = subprocess.run(
|
|
|
- cmd,
|
|
|
- stdout=subprocess.PIPE,
|
|
|
- stderr=subprocess.PIPE,
|
|
|
- creationflags=creationflags # 关键点:隐藏窗口
|
|
|
- )
|
|
|
-
|
|
|
- if result.returncode == 0:
|
|
|
- log.info(f"解压成功: {rar_path} → {extract_to}")
|
|
|
- return True
|
|
|
- else:
|
|
|
- log.info(f"解压失败: {result.stderr.decode('gbk')}")
|
|
|
- return False
|
|
|
-
|
|
|
-
|
|
|
-def crawl_with_selenium(url):
|
|
|
def detect_latest_month(driver, url):
    """Find the newest month on the listing page that is not stored yet.

    Loads ``url`` and checks the current calendar month and the two months
    before it, newest first. A month qualifies when the page contains a
    link whose ``title`` includes ``"<year>年<month>月"`` and
    ``base_mysql.get_code_exist`` reports no existing rows for that month.

    Args:
        driver: Selenium WebDriver used to load and inspect the page.
        url: Address of the listing page to inspect.

    Returns:
        The matching title fragment, e.g. ``"2025年3月"`` (the month is not
        zero-padded), or ``None`` when none of the three months qualifies.
    """
    driver.get(url)
    today = datetime.now()
    # Step back in whole calendar months. Subtracting multiples of 30 days
    # can land in the same month twice (e.g. when run on the 31st) and can
    # skip a short month such as February entirely.
    newest_index = today.year * 12 + (today.month - 1)

    for offset in range(3):
        check_year, month_zero_based = divmod(newest_index - offset, 12)
        check_month = month_zero_based + 1
        target_title = f"{check_year}年{check_month}月"
        try:
            # Wait for a link for this month to appear on the page.
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, f'//a[contains(@title, "{target_title}")]')
                )
            )
            log.info(f"已找到最新月份数据 {check_year}-{check_month}")
            # Skip months whose data is already stored.
            # NOTE(review): "340000" is presumably the Anhui region code;
            # confirm against base_mysql.get_code_exist.
            count = base_mysql.get_code_exist(f'{check_year}-{check_month:02d}', "340000")
            if count > 0:
                log.info(f"已存在 {check_year}-{check_month} 数据,跳过")
                continue
            return target_title
        except Exception:
            # Best effort: any failure for this month is treated as "not
            # found" and the next older month is tried. Unlike a bare
            # ``except:``, this lets KeyboardInterrupt and SystemExit
            # propagate.
            log.info(f"未找到 {target_title}")
            continue
    log.info("三个月内未找到有效数据")
    return None
|
|
|
+
|
|
|
+def crawl_with_selenium(url, mark):
|
|
|
driver = webdriver.Firefox(options=configure_stealth_options())
|
|
|
|
|
|
+ year_month = None
|
|
|
+ if 'increment' == mark:
|
|
|
+ res = detect_latest_month(driver, url)
|
|
|
+ if res is None:
|
|
|
+ log.info("安徽省海关没有最新数据更新")
|
|
|
+ sys.exit(0)
|
|
|
+ year_month = res
|
|
|
+ print(f"检测到最新有效数据:{year_month}")
|
|
|
+
|
|
|
try:
|
|
|
# 注入反检测脚本
|
|
|
driver.execute_script("""
|
|
@@ -202,7 +220,7 @@ def crawl_with_selenium(url):
|
|
|
|
|
|
while True:
|
|
|
# 访问当前页
|
|
|
- result = find_target_links(driver)
|
|
|
+ result = find_target_links(driver, year_month)
|
|
|
if result == 'stop':
|
|
|
break
|
|
|
|
|
@@ -294,7 +312,8 @@ def hierarchical_traversal(root_path):
|
|
|
gov_commodity_anhui_city.process_folder(md['path'])
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
- crawl_with_selenium('http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html')
|
|
|
+ crawl_with_selenium('http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html', 'all')
|
|
|
+ # crawl_with_selenium('http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html', 'increment')
|
|
|
print(f"安徽合肥海关全量数据下载任务完成")
|
|
|
# 等待5s后执行
|
|
|
time.sleep(5)
|