Explorar o código

crawl add duration

zhangfan hai 1 mes
pai
achega
6b715c9051

+ 6 - 6
crossborder/anhui/crawl_gov_anhui_full.py

@@ -236,7 +236,7 @@ def crawl_with_selenium(url, mark):
             # 获取下一页的URL
             next_page_url = next_page_btn.get_attribute("onclick")
             if not next_page_url:
-                log.info("已到达最后一页,停止爬取")
+                log.info("已到达最后一页,停止采集")
                 break
             # 从onclick属性中提取URL
             next_page_url = re.search(r"'(.*?)'", next_page_url).group(1)
@@ -247,7 +247,7 @@ def crawl_with_selenium(url, mark):
             # 访问下一页
             driver.get(next_page_url)
 
-            log.info(f"开始爬取 {next_page_url} 页面数据")
+            log.info(f"开始采集 {next_page_url} 页面数据")
 
     finally:
         driver.quit()
@@ -328,16 +328,16 @@ def main():
 
     start_time = time.time()
     if args.year == 2023:
-        log.info("正在全量爬取安徽省海关数据")
+        log.info("正在全量采集安徽省海关数据")
         crawl_with_selenium('http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html','all')
         duration = time.time() - start_time
-        send_dingtalk_message(f'安徽省海关全量数据爬取完成,耗时 {duration:.2f} 秒')
+        send_dingtalk_message(f'【安徽省海关】全量数据采集完成,耗时 {duration:.2f} 秒')
     else:
-        log.info("正在增量爬取安徽省海关数据")
+        log.info("正在增量采集安徽省海关数据")
         res = crawl_with_selenium('http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html','auto')
         if res == 'finish':
             duration = time.time() - start_time
-            send_dingtalk_message(f'安徽省海关增量数据爬取完成,耗时 {duration:.2f} 秒')
+            send_dingtalk_message(f'【安徽省海关】增量数据采集完成,耗时 {duration:.2f} 秒')
 
 if __name__ == '__main__':
     main()

+ 6 - 6
crossborder/hebei/crawl_gov_hebei_full.py

@@ -215,7 +215,7 @@ def crawl_with_selenium(url, mark):
             # 获取下一页的URL
             next_page_url = next_page_btn.get_attribute("onclick")
             if not next_page_url:
-                log.info("已到达最后一页,停止爬取")
+                log.info("已到达最后一页,停止采集")
                 break
             # 从onclick属性中提取URL
             next_page_url = re.search(r"'(.*?)'", next_page_url).group(1)
@@ -226,7 +226,7 @@ def crawl_with_selenium(url, mark):
             # 访问下一页
             driver.get(next_page_url)
 
-            log.info(f"开始爬取 {next_page_url} 页面数据")
+            log.info(f"开始采集 {next_page_url} 页面数据")
     finally:
         driver.quit()
         # 等待5s后执行
@@ -306,16 +306,16 @@ def main():
 
     start_time = time.time()
     if args.year == 2023:
-        log.info("正在全量爬取河北省海关数据")
+        log.info("正在全量采集河北省海关数据")
         crawl_with_selenium('http://shijiazhuang.customs.gov.cn/shijiazhuang_customs/zfxxgk43/2988665/2988681/index.html', 'all')
         duration = time.time() - start_time
-        send_dingtalk_message(f'河北省海关全量数据爬取完成,耗时 {duration:.2f} 秒')
+        send_dingtalk_message(f'【河北省海关】全量数据采集完成,耗时 {duration:.2f} 秒')
     else:
-        log.info("正在增量爬取河北省海关数据")
+        log.info("正在增量采集河北省海关数据")
         res = crawl_with_selenium('http://shijiazhuang.customs.gov.cn/shijiazhuang_customs/zfxxgk43/2988665/2988681/index.html','auto')
         if res == 'finish':
             duration = time.time() - start_time
-            send_dingtalk_message(f'河北省海关增量数据爬取完成,耗时 {duration:.2f} 秒')
+            send_dingtalk_message(f'【河北省海关】增量数据采集完成,耗时 {duration:.2f} 秒')
 
 if __name__ == '__main__':
     main()

+ 6 - 6
crossborder/jiangsu/crawl_gov_jiangsu_full.py

@@ -256,7 +256,7 @@ def crawl_with_selenium(url, mark):
             # 获取下一页的URL
             next_page_url = next_page_btn.get_attribute("onclick")
             if not next_page_url:
-                log.info("已到达最后一页,停止爬取")
+                log.info("已到达最后一页,停止采集")
                 break
             # 从onclick属性中提取URL
             next_page_url = re.search(r"'(.*?)'", next_page_url).group(1)
@@ -267,7 +267,7 @@ def crawl_with_selenium(url, mark):
             # 访问下一页
             driver.get(next_page_url)
 
-            log.info(f"开始爬取 {next_page_url} 页面数据")
+            log.info(f"开始采集 {next_page_url} 页面数据")
 
     finally:
         driver.quit()
@@ -343,16 +343,16 @@ def main():
 
     start_time = time.time()
     if args.year == 2023:
-        log.info("正在全量爬取江苏省海关数据")
+        log.info("正在全量采集江苏省海关数据")
         crawl_with_selenium('http://nanjing.customs.gov.cn/nanjing_customs/zfxxgk58/fdzdgknr95/3010051/589289/7e2fcc72-1.html','all')
         duration = time.time() - start_time
-        send_dingtalk_message(f'江苏省海关全量数据爬取完成,耗时 {duration:.2f} 秒')
+        send_dingtalk_message(f'【江苏省海关】全量数据采集完成,耗时 {duration:.2f} 秒')
     else:
-        log.info("正在增量爬取江苏省海关数据")
+        log.info("正在增量采集江苏省海关数据")
         res = crawl_with_selenium('http://nanjing.customs.gov.cn/nanjing_customs/zfxxgk58/fdzdgknr95/3010051/589289/7e2fcc72-1.html','auto')
         if res == 'finish':
             duration = time.time() - start_time
-            send_dingtalk_message(f'江苏省海关增量数据爬取完成,耗时 {duration:.2f} 秒')
+            send_dingtalk_message(f'【江苏省海关】增量数据采集完成,耗时 {duration:.2f} 秒')
 
 if __name__ == '__main__':
     main()

+ 5 - 5
crossborder/zhejiang/crawl_gov_zhejiang_full.py

@@ -56,7 +56,7 @@ def configure_stealth_options():
     return opts
 
 def crawl_by_year_tabs(driver, base_url, year_month):
-    """按年份Tab导航爬取数据"""
+    """按年份Tab导航采集数据"""
     years = ['2023年', '2024年', '2025年']
     WebDriverWait(driver, 30).until(
         EC.presence_of_element_located((By.CLASS_NAME, "portlet"))
@@ -390,16 +390,16 @@ def main():
 
     start_time = time.time()
     if args.year == 2023:
-        log.info("正在全量爬取浙江省海关数据")
+        log.info("正在全量采集浙江省海关数据")
         crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html', 'all')
         duration = time.time() - start_time
-        send_dingtalk_message(f'浙江省海关全量数据爬取完成,耗时 {duration:.2f} 秒')
+        send_dingtalk_message(f'【浙江省海关】全量数据采集完成,耗时 {duration:.2f} 秒')
     else:
-        log.info("正在增量爬取浙江省海关数据")
+        log.info("正在增量采集浙江省海关数据")
         res = crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html','auto')
         if res == 'finish':
             duration = time.time() - start_time
-            send_dingtalk_message(f'浙江省海关增量数据爬取完成,耗时 {duration:.2f} 秒')
+            send_dingtalk_message(f'【浙江省海关】增量数据采集完成,耗时 {duration:.2f} 秒')
 
 if __name__ == '__main__':
     main()