shici
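A multithreaded scraper for http://4.36900.site/cidian/: it walks article IDs sequentially, saves each page's full HTML to a .txt file named after the article's title, and collects any failed URLs in 404.txt. The ID range and worker count below are tuning knobs, not fixed requirements.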
import os
import re
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

# Base URL of the site and the range of article IDs to scrape
base_url = "http://4.36900.site/cidian/"
start_id = 1
end_id = 420000  # adjust this range to however many articles you want to fetch

def sanitize_filename(filename):
    """Strip characters that are illegal in filenames."""
    return re.sub(r'[\\/*?:"<>|]', "", filename)

def fetch_article_content(article_id):
    url = f"{base_url}{article_id}/"
    try:
        # A timeout keeps a stalled connection from hanging the worker thread
        response = requests.get(url, timeout=10)
        if response.status_code == 200 and response.text.strip():
            soup = BeautifulSoup(response.text, 'html.parser')
            # The title is the text node following the <span> inside div.arc-n-a
            title_tag = soup.find('div', {'class': 'arc-n-a'})
            if title_tag and title_tag.span and title_tag.span.next_sibling:
                title = title_tag.span.next_sibling.strip()
            else:
                print(f"Could not find a title for article {article_id}")
                title = f"unnamed_{article_id}"
            # Keep the page's full HTML
            content = str(soup)
            return article_id, title, content, None
        else:
            print(f"Request failed or page empty: {response.status_code} for article {article_id}")
            return article_id, None, None, url
    except Exception as e:
        print(f"Request for article {article_id} failed: {e}")
        return article_id, None, None, url

def save_to_file(content, title):
    sanitized_title = sanitize_filename(title)
    filename = f'{sanitized_title}.txt'
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)
    print(f"Saved article '{title}' to {filename}")

def save_failed_urls(failed_urls):
    filename = '404.txt'
    with open(filename, 'w', encoding='utf-8') as file:
        for url in failed_urls:
            file.write(f'{url}\n')
    print(f"Saved failed URLs to {filename}")

def process_article(article_id, failed_urls):
    article_id, title, content, failed_url = fetch_article_content(article_id)
    if content:
        save_to_file(content, title)
    elif failed_url:
        # list.append is atomic in CPython, so sharing this list across threads is safe
        failed_urls.append(failed_url)

def main():
    os.makedirs('articles', exist_ok=True)
    os.chdir('articles')

    failed_urls = []

    # Thread pool size; tune to your needs and to how much load the site tolerates.
    # Note that all (end_id - start_id + 1) tasks are queued up front.
    max_workers = 4
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_article, i, failed_urls)
                   for i in range(start_id, end_id + 1)]
        for future in as_completed(futures):
            try:
                future.result()
            except Exception as exc:
                print(f'Exception while processing: {exc}')

    if failed_urls:
        save_failed_urls(failed_urls)

if __name__ == '__main__':
    main()
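Assuming the script is saved as shici.py (the filename is illustrative), a run looks like:

    python shici.py

Output lands in an articles/ directory created under the current working directory, one .txt file of raw HTML per article, with any URLs that failed or returned empty pages listed in articles/404.txt for a later retry pass.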