|
@@ -18,6 +18,7 @@ import gov_commodity_zhejiang_city
|
|
|
import gov_commodity_zhejiang_country
|
|
|
import gov_commodity_zhejiang_import_export
|
|
|
from utils import base_country_code, base_mysql
|
|
|
+from utils.log import log
|
|
|
|
|
|
download_dir = base_country_code.download_dir
|
|
|
Path(download_dir).mkdir(parents=True, exist_ok=True)
|
|
@@ -25,7 +26,7 @@ Path(download_dir).mkdir(parents=True, exist_ok=True)
|
|
|
def configure_stealth_options():
|
|
|
"""增强型反检测配置[1,4](@ref)"""
|
|
|
opts = FirefoxOptions()
|
|
|
- print("当前下载路径:", Path(download_dir).resolve())
|
|
|
+ log.info(f"当前下载路径: {Path(download_dir).resolve()}")
|
|
|
# 文件下载配置
|
|
|
opts.set_preference("browser.download.dir", download_dir)
|
|
|
opts.set_preference("browser.download.folderList", 2)
|
|
@@ -62,7 +63,7 @@ def crawl_by_year_tabs(driver, base_url):
|
|
|
for tab in year_tabs:
|
|
|
year_text = tab.text.strip()
|
|
|
if int(year_text[:4]) <= 2022:
|
|
|
- print(f"{year_text} 后的数据无需下载")
|
|
|
+ log.info(f"{year_text} 后的数据无需下载")
|
|
|
continue
|
|
|
|
|
|
year_url = tab.get_attribute("href")
|
|
@@ -72,7 +73,7 @@ def crawl_by_year_tabs(driver, base_url):
|
|
|
# 新标签页打开年份页面
|
|
|
driver.execute_script("window.open(arguments[0]);", year_url)
|
|
|
driver.switch_to.window(driver.window_handles[-1])
|
|
|
- print(f"\n正在处理 {year_text} 年份页面")
|
|
|
+ log.info(f"\n正在处理 {year_text} 年份页面")
|
|
|
|
|
|
process_month_tabs(driver, year_text, base_url)
|
|
|
|
|
@@ -98,7 +99,7 @@ def process_month_tabs(driver, year, base_url):
|
|
|
# 全量获取所有月份Tab
|
|
|
month_items = driver.find_elements(By.XPATH, '//ul[@class="nav_tab"]//li')
|
|
|
if not month_items:
|
|
|
- print(f"{year}年没有月份Tab,停止处理")
|
|
|
+ log.info(f"{year}年没有月份Tab,停止处理")
|
|
|
break
|
|
|
|
|
|
all_found = True
|
|
@@ -114,7 +115,7 @@ def process_month_tabs(driver, year, base_url):
|
|
|
if not month_text in target_months:
|
|
|
continue # 跳过已处理月份
|
|
|
|
|
|
- print(f"点击月份Tab:{year}-{month_text}")
|
|
|
+ log.info(f"点击月份Tab:{year}-{month_text}")
|
|
|
a_tag.click()
|
|
|
|
|
|
# 处理详情页逻辑
|
|
@@ -123,9 +124,9 @@ def process_month_tabs(driver, year, base_url):
|
|
|
)
|
|
|
detail_link_arr = get_behind_detail_link(driver, base_url)
|
|
|
if not detail_link_arr:
|
|
|
- print(f"{year}-{month_text} 未找到详情链接")
|
|
|
+ log.info(f"{year}-{month_text} 未找到详情链接")
|
|
|
for detail_link in detail_link_arr:
|
|
|
- print(f"{year}-{month_text} 详情链接:{detail_link}")
|
|
|
+ log.info(f"{year}-{month_text} 详情链接:{detail_link}")
|
|
|
driver.get(detail_link)
|
|
|
download_file_from_detail_page(driver)
|
|
|
driver.back()
|
|
@@ -137,24 +138,24 @@ def process_month_tabs(driver, year, base_url):
|
|
|
found = True
|
|
|
|
|
|
if not found:
|
|
|
- print(f"{year}年未找到 {month_text} Tab")
|
|
|
+ log.info(f"{year}年未找到 {month_text} Tab")
|
|
|
all_found = False
|
|
|
|
|
|
if all_found:
|
|
|
- print(f"{year}年所有目标月份处理完成")
|
|
|
+ log.info(f"{year}年所有目标月份处理完成")
|
|
|
break
|
|
|
else:
|
|
|
# 部分月份未找到,重新获取元素
|
|
|
# retry_count += 1
|
|
|
- print(f"第 {retry_count} 次重试获取月份Tab...")
|
|
|
+ log.info(f"第 {retry_count} 次重试获取月份Tab...")
|
|
|
time.sleep(2)
|
|
|
|
|
|
except StaleElementReferenceException:
|
|
|
- print("页面刷新,重新获取月份Tab列表...")
|
|
|
+ log.info("页面刷新,重新获取月份Tab列表...")
|
|
|
# retry_count += 1
|
|
|
time.sleep(2)
|
|
|
|
|
|
- print(f"{year}年最终处理的月份:{processed_months}")
|
|
|
+ log.info(f"{year}年最终处理的月份:{processed_months}")
|
|
|
|
|
|
def get_behind_detail_link(driver, base_url):
|
|
|
"""获取点击月份Tab后 conList_ul 下所有 li 的 a 标签完整链接"""
|
|
@@ -170,7 +171,7 @@ def get_behind_detail_link(driver, base_url):
|
|
|
href_arr.append(full_url)
|
|
|
return href_arr
|
|
|
except Exception as e:
|
|
|
- print(f"获取详情链接失败: {str(e)}")
|
|
|
+ log.info(f"获取详情链接失败: {str(e)}")
|
|
|
return []
|
|
|
|
|
|
def download_file_from_detail_page(driver):
|
|
@@ -181,7 +182,7 @@ def download_file_from_detail_page(driver):
|
|
|
try:
|
|
|
elements = driver.find_elements(By.XPATH, '//div[@class="easysite-news-content"]//div[@id="easysiteText"]//p//a')
|
|
|
if not elements:
|
|
|
- print("详情页未找到目标文件链接")
|
|
|
+ log.info("详情页未找到目标文件链接")
|
|
|
return
|
|
|
|
|
|
for download_btn in elements:
|
|
@@ -191,10 +192,10 @@ def download_file_from_detail_page(driver):
|
|
|
file_url = download_btn.get_attribute("href")
|
|
|
|
|
|
if not file_url.lower().endswith(('.xls', '.xlsx')):
|
|
|
- print(f"跳过非 Excel 文件: {file_url}")
|
|
|
+ log.info(f"跳过非 Excel 文件: {file_url}")
|
|
|
continue
|
|
|
|
|
|
- print(f"正在下载: {file_name} → {file_url}")
|
|
|
+ log.info(f"正在下载: {file_name} → {file_url}")
|
|
|
|
|
|
# 记录下载前的文件列表
|
|
|
existing_files = set(f.name for f in Path(download_dir).glob('*'))
|
|
@@ -207,17 +208,17 @@ def download_file_from_detail_page(driver):
|
|
|
year, start_month, month = extract_year_and_month(file_name)
|
|
|
final_path = Path(download_dir) / year / month / f"{file_name}"
|
|
|
if os.path.exists(final_path):
|
|
|
- print(f"文件已存在:{file_name} 正在覆盖...")
|
|
|
+ log.info(f"文件已存在:{file_name} 正在覆盖...")
|
|
|
os.unlink(final_path)
|
|
|
|
|
|
final_dir = Path(download_dir) / year / month
|
|
|
final_dir.mkdir(parents=True, exist_ok=True)
|
|
|
- print(f"√ 正在移动文件 {downloaded_file} 至 {final_path}")
|
|
|
+ log.info(f"√ 正在移动文件 {downloaded_file} 至 {final_path}")
|
|
|
downloaded_file.rename(final_path)
|
|
|
- print(f"√ 下载成功:{final_path}")
|
|
|
+ log.info(f"√ 下载成功:{final_path}")
|
|
|
|
|
|
except Exception as e:
|
|
|
- print(f"详情页处理异常: {str(e)}")
|
|
|
+ log.info(f"详情页处理异常: {str(e)}")
|
|
|
|
|
|
def extract_year_and_month(file_name):
|
|
|
# 支持两种格式:
|
|
@@ -250,10 +251,10 @@ def extract_rar(rar_path, extract_to):
|
|
|
)
|
|
|
|
|
|
if result.returncode == 0:
|
|
|
- print(f"解压成功: {rar_path} → {extract_to}")
|
|
|
+ log.info(f"解压成功: {rar_path} → {extract_to}")
|
|
|
return True
|
|
|
else:
|
|
|
- print(f"解压失败: {result.stderr.decode('gbk')}")
|
|
|
+ log.info(f"解压失败: {result.stderr.decode('gbk')}")
|
|
|
return False
|
|
|
|
|
|
|
|
@@ -320,7 +321,7 @@ def hierarchical_traversal(root_path):
|
|
|
# 按年倒序
|
|
|
for year_dir in sorted(year_dirs, key=lambda x: x.name, reverse=True):
|
|
|
# 构造完整的路径:download/shandong/2025/03
|
|
|
- print(f"\n年份:{year_dir.name} | 省份:jiangsu")
|
|
|
+ log.info(f"\n年份:{year_dir.name} | 省份:zhejiang")
|
|
|
|
|
|
# 提取月份目录
|
|
|
month_dirs = []
|
|
@@ -333,20 +334,20 @@ def hierarchical_traversal(root_path):
|
|
|
# 按月倒序输出
|
|
|
if month_dirs:
|
|
|
for md in sorted(month_dirs, key=lambda x: x["month"], reverse=True):
|
|
|
- print(f" 月份:{md['month']:02d} | 路径:{md['path']}")
|
|
|
+ log.info(f" 月份:{md['month']:02d} | 路径:{md['path']}")
|
|
|
gov_commodity_zhejiang_import_export.process_folder(md['path'])
|
|
|
gov_commodity_zhejiang_country.process_folder(md['path'])
|
|
|
gov_commodity_zhejiang_city.process_folder(md['path'])
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html')
|
|
|
- print(f"浙江杭州海关全量数据下载任务完成")
|
|
|
+ log.info(f"浙江杭州海关全量数据下载任务完成")
|
|
|
# 等待5s后执行
|
|
|
time.sleep(5)
|
|
|
hierarchical_traversal(base_country_code.download_dir)
|
|
|
- print("浙江杭州海关类章、国家、城市所有文件处理完成!")
|
|
|
+ log.info("浙江杭州海关类章、国家、城市所有文件处理完成!")
|
|
|
time.sleep(5)
|
|
|
base_mysql.update_january_yoy('浙江省')
|
|
|
base_mysql.update_shandong_yoy('浙江省')
|
|
|
- print("浙江杭州海关城市同比sql处理完成")
|
|
|
+ log.info("浙江杭州海关城市同比sql处理完成")
|
|
|
|