@@ -1,8 +1,9 @@
 import os
 import random
 import re
-import subprocess
+import sys
 import time
+from datetime import datetime, timedelta
 from pathlib import Path

 from faker import Faker
@@ -16,6 +17,7 @@ import gov_commodity_anhui_city
 import gov_commodity_anhui_country
 import gov_commodity_anhui_import_export
 from utils import base_country_code, base_mysql
+from utils.log import log

 download_dir = base_country_code.download_dir
 Path(download_dir).mkdir(parents=True, exist_ok=True)
@@ -49,7 +51,7 @@ def configure_stealth_options():
     opts.add_argument("--headless")
     return opts

-def find_target_links(driver):
+def find_target_links(driver, year_month):
     """点击列表页链接进入详情页下载文件"""
     WebDriverWait(driver, 30).until(
         EC.presence_of_element_located((By.ID, "conRight"))
@@ -70,11 +72,11 @@ def find_target_links(driver):
             # 新标签页打开链接
             driver.execute_script("window.open(arguments[0]);", link_url)
             driver.switch_to.window(driver.window_handles[-1])
-            print(f"正在处理详情页: {link_url}")
+            log.info(f"正在处理详情页: {link_url}")

             try:
                 # 在详情页下载文件
-                download_result = download_file_from_detail_page(driver)
+                download_result = download_file_from_detail_page(driver, year_month)
                 if download_result == 'stop':
                     return 'stop'
                 processed_urls.add(link_url)
@@ -87,9 +89,9 @@ def find_target_links(driver):

         return None
     except Exception as e:
-        print(f"下载时发生异常: {str(e)}")
+        log.info(f"下载时发生异常: {str(e)}")

-def download_file_from_detail_page(driver):
+def download_file_from_detail_page(driver, year_month):
     WebDriverWait(driver, 30).until(
         EC.presence_of_element_located((By.ID, "easysiteText"))
     )
@@ -97,17 +99,22 @@ def download_file_from_detail_page(driver):
     try:
         elements = driver.find_elements(By.XPATH, '//div[@id="easysiteText"]//a')
         if not elements:
-            print("详情页未找到目标文件链接")
+            log.info("详情页未找到目标文件链接")
             return None

         for download_btn in elements:
             file_name = download_btn.text.strip()
             if not file_name:
                 continue
-            if file_name.startswith('2022'):
-                return 'stop'
+            if year_month is None:
+                if file_name.startswith('2022'):
+                    return 'stop'
+            else:
+                if not file_name.startswith(year_month):
+                    log.info(f"非 {year_month} 文件: {file_name}, stop")
+                    return 'stop'
             if '美元' in file_name or '商品贸易方式' in file_name or '进出口总值' in file_name or '月度表' in file_name:
-                print(f'{file_name} 不需要此文件,跳过')
+                log.info(f'{file_name} 不需要此文件,跳过')
                 continue

             file_url = download_btn.get_attribute("href")
@@ -116,10 +123,10 @@ def download_file_from_detail_page(driver):
                 file_url = base_url + file_url

             if not file_url.lower().endswith(('.xls', '.xlsx')):
-                print(f"跳过非 Excel 文件: {file_url}")
+                log.info(f"跳过非 Excel 文件: {file_url}")
                 continue

-            print(f"正在下载: {file_name} → {file_url}")
+            log.info(f"正在下载: {file_name} → {file_url}")

             # 记录下载前的文件列表
             existing_files = set(f.name for f in Path(download_dir).glob('*'))
@@ -132,18 +139,18 @@ def download_file_from_detail_page(driver):
             year, start_month, month = extract_year_and_month(file_name)
             final_path = Path(download_dir) / year / month / f"{file_name}"
             if os.path.exists(final_path):
-                print(f"文件已存在:{file_name} 正在覆盖...")
+                log.info(f"文件已存在:{file_name} 正在覆盖...")
                 os.unlink(final_path)

             final_dir = Path(download_dir) / year / month
             final_dir.mkdir(parents=True, exist_ok=True)
-            print(f"√ 正在移动文件 {downloaded_file} 至 {final_path}")
+            log.info(f"√ 正在移动文件 {downloaded_file} 至 {final_path}")
             downloaded_file.rename(final_path)
-            print(f"√ 下载成功:{final_path} \n")
+            log.info(f"√ 下载成功:{final_path} \n")

         return None
     except Exception as e:
-        print(f"详情页处理异常: {str(e)}")
+        log.info(f"详情页处理异常: {str(e)}")
         return None

 def extract_year_and_month(file_name):
@@ -161,32 +168,44 @@ def extract_year_and_month(file_name):
     else:
         raise ValueError(f"无法从文件名中提取年份和月份:{file_name}")

-def extract_rar(rar_path, extract_to):
-    """备用解压函数(当 rarfile 失效时使用)"""
-    winrar_path = r"C:\Program Files\WinRAR\Rar.exe"  # 推荐使用 Rar.exe 而非 WinRAR.exe
-    cmd = [winrar_path, 'x', '-y', rar_path, str(extract_to)]
-
-    # 使用 CREATE_NO_WINDOW 防止弹出命令行窗口
-    creationflags = subprocess.CREATE_NO_WINDOW if os.name == 'nt' else 0
-
-    result = subprocess.run(
-        cmd,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-        creationflags=creationflags  # 关键点:隐藏窗口
-    )
-
-    if result.returncode == 0:
-        print(f"解压成功: {rar_path} → {extract_to}")
-        return True
-    else:
-        print(f"解压失败: {result.stderr.decode('gbk')}")
-        return False
-
-
-def crawl_with_selenium(url):
+def detect_latest_month(driver, url):
+    driver.get(url)
+    current_date = datetime.now()
+    for offset in range(0, 3):
+        check_date = current_date - timedelta(days=offset * 30)
+        check_year = check_date.year
+        check_month = check_date.month
+
+        target_title = f"{check_year}年{check_month}月"
+        try:
+            WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located((By.XPATH, f'//a[contains(@title, "{target_title}")]'))
+            )
+            log.info(f"已找到最新月份数据 {check_year}-{check_month}")
+            # 看是否已存表,已存则跳过
+            count = base_mysql.get_code_exist(f'{check_year}-{check_month:02d}', "340000")
+            if count > 0:
+                log.info(f"已存在 {check_year}-{check_month} 数据,跳过")
+                continue
+            return f"{check_year}年{check_month}月"
+        except Exception:
+            log.info(f"未找到 {target_title}")
+            continue
+    log.info("三个月内未找到有效数据")
+    return None
+
+def crawl_with_selenium(url, mark):
     driver = webdriver.Firefox(options=configure_stealth_options())

+    year_month = None
+    if 'increment' == mark:
+        res = detect_latest_month(driver, url)
+        if res is None:
+            log.info("安徽省海关没有最新数据更新")
+            sys.exit(0)
+        year_month = res
+        log.info(f"检测到最新有效数据:{year_month}")
+
     try:
         # 注入反检测脚本
         driver.execute_script("""
@@ -201,7 +220,7 @@ def crawl_with_selenium(url):

         while True:
             # 访问当前页
-            result = find_target_links(driver)
+            result = find_target_links(driver, year_month)
             if result == 'stop':
                 break

@@ -217,7 +236,7 @@ def crawl_with_selenium(url):
             # 获取下一页的URL
             next_page_url = next_page_btn.get_attribute("onclick")
             if not next_page_url:
-                print("已到达最后一页,停止爬取")
+                log.info("已到达最后一页,停止爬取")
                 break
             # 从onclick属性中提取URL
             next_page_url = re.search(r"'(.*?)'", next_page_url).group(1)
@@ -228,7 +247,7 @@ def crawl_with_selenium(url):
             # 访问下一页
             driver.get(next_page_url)

-            print(f"开始爬取 {next_page_url} 页面数据")
+            log.info(f"开始爬取 {next_page_url} 页面数据")

     finally:
         driver.quit()
@@ -274,7 +293,7 @@ def hierarchical_traversal(root_path):
     # 按年倒序
     for year_dir in sorted(year_dirs, key=lambda x: x.name, reverse=True):
         # 构造完整的路径:download/shandong/2025/03
-        print(f"\n年份:{year_dir.name} | 省份:jiangsu")
+        print(f"\n年份:{year_dir.name} | 省份:anhui")

         # 提取月份目录
         month_dirs = []
@@ -293,7 +312,8 @@ def hierarchical_traversal(root_path):
             gov_commodity_anhui_city.process_folder(md['path'])

 if __name__ == "__main__":
-    crawl_with_selenium('http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html')
+    crawl_with_selenium('http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html', 'all')
+    # crawl_with_selenium('http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html', 'increment')
     print(f"安徽合肥海关全量数据下载任务完成")
     # 等待5s后执行
     time.sleep(5)
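
Review note (outside the diff): detect_latest_month() walks back through candidate months with
timedelta(days=offset * 30). That 30-day approximation can skip a short month entirely; for
example, on March 1st, offset=1 lands on January 30th and February is never checked. A
calendar-aware rollback avoids this. The sketch below is illustrative only, under that
assumption; the helper name roll_back_months is hypothetical and not part of this change.

    from datetime import datetime

    def roll_back_months(base: datetime, offset: int) -> tuple:
        """Return (year, month) for `offset` whole calendar months before `base`."""
        # Count months since year 0, step back, then split into year and month again.
        total = base.year * 12 + (base.month - 1) - offset
        return total // 12, total % 12 + 1

    # Example: on 2025-03-01 the three candidates become 2025年3月, 2025年2月, 2025年1月.
    for offset in range(3):
        year, month = roll_back_months(datetime(2025, 3, 1), offset)
        print(f"{year}年{month}月")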