[Scraper] Batch-collecting girl avatar images

Background: the company needs a large number of girl avatars for batch-updating account profile pictures.

Sharing the code here (and padding my post count a bit) :tieba_003:
With a few tweaks the code can also fetch other kinds of avatars, it's not limited to girls (see the sketch after the first script) :tieba_003:

Script 1
import requests
from bs4 import BeautifulSoup
import os
import threading
from urllib.parse import urljoin  # resolve relative image URLs against the page URL

# Download a single image
def download_image(img_url, save_dir):
    try:
        img_data = requests.get(img_url, timeout=10).content
        img_name = os.path.join(save_dir, img_url.split("/")[-1])
        with open(img_name, "wb") as f:
            f.write(img_data)
        print(f"Downloaded: {img_url} -> {img_name}")
    except Exception as e:
        print(f"Download failed: {img_url}, error: {e}")

# Scrape the images on a detail page
def scrape_detail_page(detail_url, save_dir):
    try:
        print(f"Scraping detail page: {detail_url}")
        response = requests.get(detail_url, timeout=10)
        if response.status_code != 200:
            print(f"Could not access detail page: {detail_url}, status code: {response.status_code}")
            return

        soup = BeautifulSoup(response.content, "html.parser")
        img_tags = soup.select("#content p img")  # image tags inside the detail page body

        for img_tag in img_tags:
            img_url = urljoin(detail_url, img_tag["src"])  # handles both absolute and relative src values
            download_image(img_url, save_dir)
    except Exception as e:
        print(f"Failed to scrape detail page: {detail_url}, error: {e}")

# Scrape the listing pages and collect all detail-page links
def scrape_images(base_url, save_dir, thread_count):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    page = 1
    threads = []

    while True:
        try:
            # Build the paginated URL
            url = base_url.replace("_1", f"_{page}")
            print(f"Scraping listing page: {url}")
            response = requests.get(url, timeout=10)

            if response.status_code != 200:
                print(f"Could not access page: {url}, status code: {response.status_code}")
                break

            soup = BeautifulSoup(response.content, "html.parser")
            detail_links = soup.select("ul.g-gxlist-imgbox li a")

            if not detail_links:
                print("No more detail-page links found, stopping.")
                break

            for link in detail_links:
                detail_url = "https://www.qqtn.com" + link["href"]

                # Scrape each detail page in its own thread
                thread = threading.Thread(target=scrape_detail_page, args=(detail_url, save_dir))
                threads.append(thread)
                thread.start()

                # Throttle the number of live threads
                while len(threads) >= thread_count:
                    for t in threads:
                        t.join(0.1)
                    threads = [t for t in threads if t.is_alive()]

            page += 1
        except Exception as e:
            print(f"Failed to scrape page: {url}, error: {e}")
            break  # stop instead of retrying the same page forever

    # Wait for the remaining threads to finish
    for t in threads:
        t.join()

# Example usage
base_url = "https://www.qqtn.com/tx/nvshengtx_1.html"
save_directory = "qqtn_images"
thread_count = 5  # adjust the number of worker threads as needed
scrape_images(base_url, save_directory, thread_count)
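To point the script at a different avatar category, usually only the entry URL needs to change (and, on a different site, the two CSS selectors). A minimal sketch, assuming qqtn.com uses the same list markup for its other sections; the category URL below is an assumed example, not verified:

# Hypothetical example: reuse the same scraper for a different category.
# "nanshengtx_1.html" is an assumed URL pattern; check the real category URL in a browser first.
base_url = "https://www.qqtn.com/tx/nanshengtx_1.html"
save_directory = "qqtn_other_images"
scrape_images(base_url, save_directory, thread_count=5)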

Script 2
import requests
from bs4 import BeautifulSoup
import os
import threading
from urllib.parse import urljoin  # resolve relative links and image URLs

# Download a single image and save it locally
def download_image(img_url, save_dir):
    try:
        img_data = requests.get(img_url, timeout=10).content
        img_name = os.path.join(save_dir, img_url.split("/")[-1])
        with open(img_name, "wb") as f:
            f.write(img_data)
        print(f"Downloaded: {img_url} -> {img_name}")
    except Exception as e:
        print(f"Download failed: {img_url}, error: {e}")

# Scrape the images on a detail page
def scrape_detail_page(detail_url, save_dir):
    try:
        print(f"Scraping detail page: {detail_url}")
        response = requests.get(detail_url, timeout=10)
        if response.status_code != 200:
            print(f"Could not access detail page: {detail_url}, status code: {response.status_code}")
            return

        soup = BeautifulSoup(response.content, "html.parser")
        img_tags = soup.find_all("img", alt=True)  # note: matches every <img> with an alt attribute on the page

        for img_tag in img_tags:
            img_url = urljoin(detail_url, img_tag["src"])  # handles relative src values
            download_image(img_url, save_dir)
    except Exception as e:
        print(f"Failed to scrape detail page: {detail_url}, error: {e}")

# Scrape the listing pages and collect the detail-page links
def scrape_images(base_url, save_dir, thread_count):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    page = 1
    threads = []

    while True:
        try:
            # Build the paginated URL (base_url must end with "/")
            url = f"{base_url}index_{page}.html" if page > 1 else base_url
            print(f"Scraping listing page: {url}")
            response = requests.get(url, timeout=10)

            if response.status_code != 200:
                print(f"Could not access page: {url}, status code: {response.status_code}")
                break

            soup = BeautifulSoup(response.content, "html.parser")
            detail_links = soup.select("ul.g-gxlist-imgbox li a")

            if not detail_links:
                print("No more detail-page links found, stopping.")
                break

            for link in detail_links:
                detail_url = urljoin(url, link["href"])  # the hrefs may be relative to the listing page

                # Scrape each detail page in its own thread
                thread = threading.Thread(target=scrape_detail_page, args=(detail_url, save_dir))
                threads.append(thread)
                thread.start()

                # Throttle the number of live threads
                while len(threads) >= thread_count:
                    for t in threads:
                        t.join(0.1)
                    threads = [t for t in threads if t.is_alive()]

            page += 1
        except Exception as e:
            print(f"Failed to scrape page: {url}, error: {e}")
            break  # stop instead of retrying the same page forever

    # Wait for the remaining threads to finish
    for t in threads:
        t.join()

# Example usage
base_url = "http://www.imeitou.com/nvsheng/mnns/"
save_directory = "images"
thread_count = 5  # adjust the number of worker threads as needed
scrape_images(base_url, save_directory, thread_count)
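The hand-rolled throttling in both scripts (one Thread per link, pruned with join(0.1)) works, but concurrent.futures does the same job with less bookkeeping. A minimal sketch of the same per-link fan-out, assuming the scrape_detail_page function above and a list of already-collected detail URLs:

from concurrent.futures import ThreadPoolExecutor

# Same idea as the manual thread list: at most thread_count detail pages are scraped concurrently.
def scrape_detail_pages(detail_urls, save_dir, thread_count=5):
    with ThreadPoolExecutor(max_workers=thread_count) as pool:
        for detail_url in detail_urls:
            pool.submit(scrape_detail_page, detail_url, save_dir)
        # leaving the "with" block waits for every submitted task to finish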


The results are shown in the screenshot.


First! Claiming the top reply.


Place of registration: :myanmar:


Hoping it won't stay limited to avatars :clown_face:


Hahaha, fun. The replies here are way too suggestive.


What about the second one?


:myanmar: is probably going too far, but it's probably a bit on the NSFW side :tieba_087:


:rofl: Hard not to let the imagination run wild


Keeping up with current events

A legitimate company?

:laughing: Impressive!


This is really useful. I happen to need to register accounts in bulk, so I can put it to work.


It should count as a legitimate company; it's just that one of the departments handles overseas projects. The company also builds police information systems.


Please begin your performance, go ahead and elaborate.

So ghs (lewd content) really is the number-one productive force, huh :joy:


By the way, is it just me whose collapsed section looks like this? It bugs my OCD a little.

I collapsed it on purpose, otherwise the layout looks a bit messy :tieba_087:

What I meant is the collapse arrow, why is it squashed flat?


I just noticed it really does look a bit odd on my end too.

There seem to be free APIs for girl pictures.
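If such an API exists, the whole scraping step collapses into one request per image. A hypothetical sketch; the endpoint below is a placeholder, not a real service, so swap in whatever free avatar API you actually find and adapt it to that API's response format:

import os
import requests

# Placeholder endpoint, not a real service.
API_URL = "https://example.com/api/random-avatar"

def fetch_avatars(count, save_dir="api_avatars"):
    os.makedirs(save_dir, exist_ok=True)
    for i in range(count):
        # Assumes the API returns raw image bytes; adjust if it returns JSON containing an image URL instead.
        resp = requests.get(API_URL, timeout=10)
        resp.raise_for_status()
        with open(os.path.join(save_dir, f"{i}.jpg"), "wb") as f:
            f.write(resp.content)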