# download_utils.py
import threading
import time
import traceback
from pathlib import Path

from faker import Faker
from selenium.webdriver import FirefoxOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from utils.log import log
  11. DOWNLOAD_TIMEOUT = 60
  12. download_lock = threading.Lock()
  13. def configure_stealth_options(download_dir):
  14. """反检测浏览器配置"""
  15. opts = FirefoxOptions()
  16. opts.set_preference("dom.webdriver.enabled", False)
  17. opts.add_argument("--disable-blink-features=AutomationControlled")
  18. opts.add_argument("--headless")
  19. opts.set_preference("general.useragent.override", Faker().user_agent())
  20. opts.set_preference("browser.download.dir", str(download_dir))
  21. opts.set_preference("browser.download.folderList", 2)
  22. opts.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/vnd.ms-excel")
  23. opts.set_preference("pdfjs.disabled", True)
  24. opts.set_preference("download.prompt_for_download", False)
  25. opts.set_preference("download.directory_upgrade", True)
  26. return opts
  27. def generate_month_sequence(start_year, start_month, end_year=None, skip_january=False):
  28. """
  29. 动态生成倒序月份序列
  30. Args:
  31. start_year (int): 检测到的最新数据年份
  32. start_month (int): 检测到的最新数据月份
  33. end_year (int, optional): 终止年份(默认 None)
  34. skip_january (bool, optional): 是否跳过所有1月数据(默认 False)
  35. Returns:
  36. List[Tuple[int, int]]: 月份序列列表,格式为 [(year, month), ...]
  37. """
  38. sequence = []
  39. current_year = start_year
  40. current_month = start_month
  41. # 当指定终止年份时
  42. if end_year:
  43. while not (current_year == end_year and current_month < 1):
  44. # 跳过1月判断
  45. if not (skip_january and current_month == 1):
  46. sequence.append((current_year, current_month))
  47. # 跨年处理
  48. if current_month == 1:
  49. current_year -= 1
  50. current_month = 12
  51. else:
  52. current_month -= 1
  53. # 终止条件:到达目标年份的1月
  54. if current_year < end_year:
  55. break
  56. else:
  57. # 未指定年份时取最近两个月
  58. if not (skip_january and current_month == 1):
  59. sequence.append((current_year, current_month))
  60. prev_year, prev_month = get_previous_month(current_year, current_month)
  61. if not (skip_january and prev_month == 1):
  62. sequence.append((prev_year, prev_month))
  63. return sequence
  64. def get_previous_month(year, month):
  65. """跨年月份计算"""
  66. if month == 1:
  67. return year - 1, 12
  68. return year, month - 1
  69. def download_excel(driver, url, year, month, title, download_dir):
  70. """文件下载模块"""
  71. download_dir.mkdir(parents=True, exist_ok=True)
  72. driver.execute_script(f"window.open('{url}')")
  73. driver.switch_to.window(driver.window_handles[-1])
  74. try:
  75. download_btn = WebDriverWait(driver, 20).until(
  76. EC.element_to_be_clickable(
  77. (By.XPATH,
  78. '//a[substring(@href, string-length(@href)-3) = ".xls" or substring(@href, string-length(@href)-4) = ".xlsx"]')
  79. )
  80. )
  81. download_btn.click()
  82. # log.info(f"√ 已点击下载按钮:{download_btn.get_attribute("href")}")
  83. downloaded_file = wait_for_download(download_dir)
  84. final_path = Path(f'{download_dir}/{year}/{month:02d}/{title}{downloaded_file.suffix}')
  85. if final_path.exists():
  86. final_path.unlink()
  87. download_rel_dir = Path(f'{download_dir}/{year}/{month:02d}')
  88. download_rel_dir.mkdir(parents=True, exist_ok=True)
  89. downloaded_file.rename(final_path)
  90. log.info(f"√ 文件已保存至:{final_path}")
  91. finally:
  92. driver.close()
  93. driver.switch_to.window(driver.window_handles[0])
  94. def batch_download_excel(driver, url, year, month, base_title, download_dir):
  95. """批量下载Excel文件模块"""
  96. download_dir.mkdir(parents=True, exist_ok=True)
  97. driver.execute_script(f"window.open('{url}')")
  98. driver.switch_to.window(driver.window_handles[-1])
  99. try:
  100. # 获取所有Excel下载按钮
  101. download_btns = WebDriverWait(driver, 20).until(
  102. EC.presence_of_all_elements_located(
  103. (By.XPATH,
  104. '//a[substring(@href, string-length(@href)-3) = ".xls" or substring(@href, string-length(@href)-4) = ".xlsx"]')
  105. )
  106. )
  107. for index, btn in enumerate(download_btns):
  108. # 生成唯一标题(可自定义规则)
  109. title = f"{base_title}_{index + 1}"
  110. # 点击下载按钮
  111. btn.click()
  112. # 等待下载完成
  113. downloaded_file = wait_for_download(download_dir)
  114. # 处理文件路径
  115. final_dir = download_dir / f'{year}' / f'{month:02d}'
  116. final_dir.mkdir(parents=True, exist_ok=True)
  117. final_path = final_dir / f'{title}{downloaded_file.suffix}'
  118. # 重命名文件
  119. if final_path.exists():
  120. final_path.unlink()
  121. downloaded_file.rename(final_path)
  122. log.info(f"√ 文件 {title} 已保存至:{final_path}")
  123. finally:
  124. driver.close()
  125. driver.switch_to.window(driver.window_handles[0])
  126. def download_excel2(driver, link, year, month, title, download_dir):
  127. download_dir = Path(download_dir)
  128. download_dir.mkdir(parents=True, exist_ok=True)
  129. try:
  130. log.info(f"正在点击链接:{title}")
  131. link.click()
  132. log.info("等待文件下载完成...")
  133. downloaded_file = wait_for_download(download_dir)
  134. if not downloaded_file.suffix:
  135. downloaded_file = downloaded_file.with_suffix('.xlsx')
  136. final_dir = download_dir / f'{year}' / f'{month:02d}'
  137. final_dir.mkdir(parents=True, exist_ok=True)
  138. final_path = final_dir / f'{title}{downloaded_file.suffix}'
  139. if final_path.exists():
  140. final_path.unlink()
  141. downloaded_file.rename(final_path)
  142. log.info(f"√ 文件已保存至:{final_path}")
  143. except TimeoutError as te:
  144. log.info(f"[错误] 文件下载超时:{te}")
  145. raise
  146. except Exception as e:
  147. log.info(f"[错误] 发生异常:{e}")
  148. log.info(traceback.format_exc())
  149. raise
# Earlier variant of wait_for_download without download_lock (disabled).
# def wait_for_download(directory):
#     """Watch for a finished download (regular files only, folders ignored)."""
#     start_time = time.time()
#     while (time.time() - start_time) < DOWNLOAD_TIMEOUT:
#         files = [
#             f for f in directory.glob('*')
#             if f.is_file() and not f.name.endswith(('.part', '.crdownload'))
#         ]
#         if files:
#             # Return the file with the most recent st_ctime
#             return max(files, key=lambda x: x.stat().st_ctime)
#         time.sleep(1)
#     raise TimeoutError("文件下载超时")
  163. def wait_for_download(directory):
  164. """文件下载监控(只读取文件,忽略文件夹)"""
  165. start_time = time.time()
  166. while (time.time() - start_time) < DOWNLOAD_TIMEOUT:
  167. with download_lock:
  168. files = [
  169. f for f in directory.glob('*')
  170. if f.is_file() and not f.name.endswith(('.part', '.crdownload'))
  171. ]
  172. if files:
  173. # 按照创建时间排序并返回最新文件
  174. return max(files, key=lambda x: x.stat().st_ctime)
  175. time.sleep(1)
  176. raise TimeoutError("文件下载超时")