|
|
@@ -6,6 +6,8 @@ import time
|
|
|
from datetime import datetime, timedelta
|
|
|
from pathlib import Path
|
|
|
|
|
|
+from selenium.webdriver.firefox.service import Service
|
|
|
+from webdriver_manager.firefox import GeckoDriverManager
|
|
|
from faker import Faker
|
|
|
from selenium import webdriver
|
|
|
from selenium.webdriver import FirefoxOptions
|
|
|
@@ -20,6 +22,8 @@ from crossborder.utils import base_country_code, base_mysql
|
|
|
from crossborder.utils.base_country_code import extract_year_month
|
|
|
from crossborder.utils.dingtalk import send_dingtalk_message
|
|
|
from crossborder.utils.log import get_logger
|
|
|
+import urllib.request
|
|
|
+import urllib.error
|
|
|
|
|
|
log = get_logger(__name__)
|
|
|
|
|
|
@@ -190,24 +194,43 @@ def detect_latest_month(driver, url):
|
|
|
continue
|
|
|
return f"{check_year}年{check_month}月"
|
|
|
except:
|
|
|
+ # except Exception as e:
|
|
|
log.error(f"未找到 {target_title}")
|
|
|
+ # log.error(f"未找到 {target_title} {e}")
|
|
|
continue
|
|
|
log.error("三个月内未找到有效数据")
|
|
|
return None
|
|
|
|
|
|
def check_internet_connection(url="http://www.baidu.com", timeout=5):
    """Check whether ``url`` can be opened within ``timeout`` seconds.

    Args:
        url: Address to probe; defaults to the Baidu home page.
        timeout: Seconds passed to ``urlopen`` as its blocking timeout.

    Returns:
        True if the URL was opened successfully, False if the attempt
        failed with an ``OSError`` (this includes ``URLError``,
        ``HTTPError``, timeouts and connection resets).
    """
    try:
        # Only reachability matters, so release the response immediately
        # instead of leaving the underlying socket open.
        with urllib.request.urlopen(url, timeout=timeout):
            return True
    except OSError:
        # URLError is a subclass of OSError. Catching the base class also
        # covers a timeout raised while waiting for the response, which
        # urllib does not wrap in URLError.
        return False
|
|
|
+
|
|
|
def crawl_with_selenium(url, mark):
|
|
|
- driver = webdriver.Firefox(options=configure_stealth_options())
|
|
|
+ if not check_internet_connection():
|
|
|
+ log.error("无法连接到互联网,请检查网络设置")
|
|
|
+ raise Exception("网络连接失败")
|
|
|
|
|
|
+ driver = None
|
|
|
year_month = None
|
|
|
- if 'auto' == mark:
|
|
|
- res = detect_latest_month(driver, url)
|
|
|
- if res is None:
|
|
|
- log.info("安徽省海关没有最新数据更新")
|
|
|
- return None
|
|
|
- year_month = res
|
|
|
- print(f"检测到最新有效数据:{year_month}")
|
|
|
-
|
|
|
try:
|
|
|
+ # 使用WebDriverManager自动管理geckodriver
|
|
|
+ service = Service(GeckoDriverManager().install())
|
|
|
+ driver = webdriver.Firefox(service=service, options=configure_stealth_options())
|
|
|
+ log.info("Firefox WebDriver初始化成功")
|
|
|
+
|
|
|
+ year_month = None
|
|
|
+ if 'auto' == mark:
|
|
|
+ res = detect_latest_month(driver, url)
|
|
|
+ if res is None:
|
|
|
+ log.info("安徽省海关没有最新数据更新")
|
|
|
+ return None
|
|
|
+ year_month = res
|
|
|
+ print(f"检测到最新有效数据:{year_month}")
|
|
|
+
|
|
|
# 注入反检测脚本
|
|
|
driver.execute_script("""
|
|
|
Object.defineProperty(navigator, 'webdriver', {
|
|
|
@@ -251,13 +274,14 @@ def crawl_with_selenium(url, mark):
|
|
|
log.info(f"开始采集 {next_page_url} 页面数据")
|
|
|
|
|
|
finally:
|
|
|
- driver.quit()
|
|
|
+ if driver:
|
|
|
+ driver.quit()
|
|
|
print(f"安徽省合肥海关全量数据下载任务完成")
|
|
|
# 等待5s后执行
|
|
|
- time.sleep(5)
|
|
|
+ time.sleep(3)
|
|
|
hierarchical_traversal(download_dir, year_month)
|
|
|
print("安徽省海关类章、国家、城市所有文件处理完成!")
|
|
|
- time.sleep(5)
|
|
|
+ time.sleep(3)
|
|
|
base_mysql.update_shandong_yoy('安徽省')
|
|
|
print("安徽省合肥海关城市同比sql处理完成")
|
|
|
return 'finish', year_month
|
|
|
@@ -353,7 +377,8 @@ def main():
|
|
|
send_dingtalk_message(f'【安徽省海关】 {r2} 增量数据采集完成,{int(minutes)}分{seconds:.1f}秒')
|
|
|
|
|
|
except Exception as e:
|
|
|
- send_dingtalk_message(f'【安徽省海关】发生错误:{e}')
|
|
|
+ # send_dingtalk_message(f'【安徽省海关】发生错误:{e}')
|
|
|
+ log.error(f'【安徽省海关】发生错误:{e}')
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
main()
|