Source code for the Windows GUI version of the search crawler

import asyncio
import concurrent.futures
import json
import re
import unicodedata
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field
from tkinter import Tk, Label, Entry, Button, Text, END, IntVar, Checkbutton, filedialog

class HelpFunctions:
    def get_base_url(self, url):
        parsed_url = urlparse(url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        return base_url

    def generate_excerpt(self, content, max_length=200):
        return content[:max_length] + "..." if len(content) > max_length else content

    def format_text(self, original_text):
        soup = BeautifulSoup(original_text, "html.parser")
        formatted_text = soup.get_text(separator=" ", strip=True)
        formatted_text = unicodedata.normalize("NFKC", formatted_text)
        formatted_text = re.sub(r"\s+", " ", formatted_text)  # collapse whitespace runs
        formatted_text = formatted_text.strip()
        formatted_text = self.remove_emojis(formatted_text)
        return formatted_text

    def remove_emojis(self, text):
        return "".join(c for c in text if not unicodedata.category(c).startswith("So"))

    def process_search_result(self, result, valves):
        title_site = self.remove_emojis(result["title"])
        url_site = result["url"]
        snippet = result.get("content", "")

        if valves.IGNORED_WEBSITES:
            base_url = self.get_base_url(url_site)
            if any(ignored_site.strip() in base_url for ignored_site in valves.IGNORED_WEBSITES.split(",")):
                return None

        try:
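            # Fetch the page; on a network error this result is simply skipped.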
            response_site = requests.get(url_site, timeout=20)
            response_site.raise_for_status()
            html_content = response_site.text

            soup = BeautifulSoup(html_content, "html.parser")
            content_site = self.format_text(soup.get_text(separator=" ", strip=True))

            truncated_content = self.truncate_to_n_words(content_site, valves.PAGE_CONTENT_WORDS_LIMIT)

            return {
                "title": title_site,
                "url": url_site,
                "content": truncated_content,
                "snippet": self.remove_emojis(snippet),
            }
        except requests.exceptions.RequestException:
            return None

    def truncate_to_n_words(self, text, token_limit):
        tokens = text.split()
        truncated_tokens = tokens[:token_limit]
        return " ".join(truncated_tokens)

class Tools:
    class Valves(BaseModel):
        SEARXNG_ENGINE_API_BASE_URL: str = Field(
            default="http://127.0.0.1:8080/search",
            description="搜索引擎的基本URL",
        )
        IGNORED_WEBSITES: str = Field(
            default="",
            description="要忽略的网站的逗号分隔列表",
        )
        RETURNED_SCRAPPED_PAGES_NO: int = Field(
            default=10,
            description="要解析的搜索引擎结果数",
        )
        SCRAPPED_PAGES_NO: int = Field(
            default=10,
            description="抓取的总页面数。理想情况下应大于返回的页面数",
        )
        PAGE_CONTENT_WORDS_LIMIT: int = Field(
            default=1000,
            description="每个页面的内容字数限制",
        )
        CITATION_LINKS: bool = Field(
            default=False,
            description="如果为True,则发送带有链接的自定义引用",
        )

    def __init__(self):
        self.valves = self.Valves()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }

    async def search_web(self, query: str) -> str:
        functions = HelpFunctions()
        search_engine_url = self.valves.SEARXNG_ENGINE_API_BASE_URL

        # Never return more pages than are actually scraped.
        if self.valves.RETURNED_SCRAPPED_PAGES_NO > self.valves.SCRAPPED_PAGES_NO:
            self.valves.RETURNED_SCRAPPED_PAGES_NO = self.valves.SCRAPPED_PAGES_NO

        params = {
            "q": query,
            "format": "json",
            "number_of_results": self.valves.RETURNED_SCRAPPED_PAGES_NO,
        }

        try:
            resp = requests.get(search_engine_url, params=params, headers=self.headers, timeout=120)
            resp.raise_for_status()
            data = resp.json()

            results = data.get("results", [])
            limited_results = results[:self.valves.SCRAPPED_PAGES_NO]

        except requests.exceptions.RequestException as e:
            return json.dumps({"error": str(e)})

        results_json = []
        if limited_results:
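            # Scrape the result pages in parallel, keeping at most RETURNED_SCRAPPED_PAGES_NO clean results.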
            with concurrent.futures.ThreadPoolExecutor() as executor:
                futures = [executor.submit(functions.process_search_result, result, self.valves) for result in limited_results]
                for future in concurrent.futures.as_completed(futures):
                    result_json = future.result()
                    if result_json:
                        try:
                            json.dumps(result_json)
                            results_json.append(result_json)
                        except (TypeError, ValueError):
                            continue
                    if len(results_json) >= self.valves.RETURNED_SCRAPPED_PAGES_NO:
                        break

            results_json = results_json[:self.valves.RETURNED_SCRAPPED_PAGES_NO]

        return json.dumps(results_json, ensure_ascii=False)

async def run_search(tools, query, output_text):
    result_json = await tools.search_web(query)
    output_text.delete("1.0", END)
    output_text.insert(END, result_json)

def save_to_file(content):
    file_path = filedialog.asksaveasfilename(defaultextension=".txt",
                                             filetypes=[("Text files", "*.txt"),
                                                        ("All files", "*.*")])
    if file_path:
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(content)

def start_gui():
    tools = Tools()

    root = Tk()
    root.title("网络搜索")
    root.geometry("600x500")

    # Query label and input field
    query_label = Label(root, text="查询内容:")
    query_label.pack()

    query_entry = Entry(root, width=50)
    query_entry.pack()

    Label(root, text="搜索引擎 API URL:").pack()
    engine_url_entry = Entry(root, width=50)
    engine_url_entry.insert(0, tools.valves.SEARXNG_ENGINE_API_BASE_URL)
    engine_url_entry.pack()

    Label(root, text="忽略网站 (用逗号分隔):").pack()
    ignored_sites_entry = Entry(root, width=50)
    ignored_sites_entry.pack()

    results_limit_label = Label(root, text="搜索结果数:")
    results_limit_label.pack()

    results_limit = IntVar(value=tools.valves.RETURNED_SCRAPPED_PAGES_NO)
    results_limit_entry = Entry(root, textvariable=results_limit)
    results_limit_entry.pack()

    page_limit_label = Label(root, text="每页内容字数限制:")
    page_limit_label.pack()

    page_limit = IntVar(value=tools.valves.PAGE_CONTENT_WORDS_LIMIT)
    page_limit_entry = Entry(root, textvariable=page_limit)
    page_limit_entry.pack()

    citation_var = IntVar()
    citation_checkbox = Checkbutton(root, text="启用引用链接", variable=citation_var)
    citation_checkbox.pack()

    output_text = Text(root, height=10, width=70)
    output_text.pack()

    def search_action():
        query = query_entry.get()
        tools.valves.SEARXNG_ENGINE_API_BASE_URL = engine_url_entry.get()
        tools.valves.IGNORED_WEBSITES = ignored_sites_entry.get()
        tools.valves.RETURNED_SCRAPPED_PAGES_NO = results_limit.get()
        tools.valves.PAGE_CONTENT_WORDS_LIMIT = page_limit.get()
        tools.valves.CITATION_LINKS = bool(citation_var.get())

        # Note: asyncio.run() blocks the Tk mainloop until the search finishes.
        asyncio.run(run_search(tools, query, output_text))

    def save_action():
        content = output_text.get("1.0", END).strip()
        if content:
            save_to_file(content)

    search_button = Button(root, text="搜索", command=search_action)
    search_button.pack()

    save_button = Button(root, text="保存结果", command=save_action)
    save_button.pack()

    root.mainloop()

if __name__ == "__main__":
    start_gui()

Command-line version
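
The command-line script itself is not reproduced in this post. Purely as an illustrative sketch (not the author's original version), the Tools class above can be driven from a terminal without any GUI; the module name searxng_gui and the argparse flags below are assumptions made for the example.

import argparse
import asyncio
import json

# Hypothetical module name: assumes the GUI script above was saved as searxng_gui.py.
from searxng_gui import Tools

def main():
    parser = argparse.ArgumentParser(description="Search the web via a SearXNG instance")
    parser.add_argument("query", help="search query")
    parser.add_argument("--engine-url", help="SearXNG search API URL")
    parser.add_argument("--results", type=int, help="number of results to return")
    args = parser.parse_args()

    tools = Tools()
    if args.engine_url:
        tools.valves.SEARXNG_ENGINE_API_BASE_URL = args.engine_url
    if args.results:
        tools.valves.RETURNED_SCRAPPED_PAGES_NO = args.results

    # search_web is a coroutine, so drive it with asyncio.run().
    result_json = asyncio.run(tools.search_web(args.query))
    # Pretty-print the JSON string returned by search_web.
    print(json.dumps(json.loads(result_json), ensure_ascii=False, indent=2))

if __name__ == "__main__":
    main()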
