A forum member helped me write a script that saves YouTube video subtitles to Notion, and I'd like to adapt it so it can automatically read Bilibili subtitles and save them to Notion. How do I write a Python program that automatically reads Bilibili subtitles and saves them to Notion?

Which APIs are needed? The code GPT-4 wrote for me doesn't run. I found some code on Bilibili and hope we can study it together:
def extract_information(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as file:
        file_content = file.read()

    # Find every "utf8" field and strip the newline characters
    information_list = []
    start_index = file_content.find('"utf8": "')
    while start_index != -1:
        start_index += len('"utf8": "')
        end_index = file_content.find('"', start_index)
        information = file_content[start_index:end_index].replace("\\n", "").replace("\n", "")
        information_list.append(information)
        start_index = file_content.find('"utf8": "', end_index)

    # Write the cleaned text to the output file
    with open(output_file, 'w', encoding='utf-8') as file:
        for info in information_list:
            file.write(info)

    print("PASS")
    print("#####     #####        ######    ######   ")
    print("#    ##  ##   ##      ##        ##        ")
    print("##  ##  ##     ##    ##        ##         ")
    print("## ##   #########      ##       ##        ")
    print("##      ##     ##       ##        ##      ")
    print("##      ##     ##        ##         ##    ")
    print("##      ##     ##         ##      ##      ")
    print("##      ##     ##     ####     ###        ")

if __name__ == "__main__":
    input_file = "Youtube字幕文件.txt"    # input file path
    output_file = "output_file_字幕.txt"  # output file path
    extract_information(input_file, output_file)

10 likes

Waiting here for a method.

I only know how to extract it manually; see this: https://www.cnblogs.com/lwp-nicol/p/18276164

2 likes
https://api.bilibili.com/x/player/wbi/v2?aid=XXX&cid=XXX

Use this endpoint and look for subtitle_url in the response.
aid is the submission's av number, and cid is the ID of the individual video part.
You can get aid and cid simply by searching for them in the HTML returned by the video playback page, i.e. https://www.bilibili.com/video/BVxxxxx.

These two parameters are written into window.__playinfo__ when the page loads, so you can cut that segment out of the HTML and parse it as JSON.
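Put together as a minimal Python sketch (the regexes, the data.subtitle.subtitles field path, and the example URL reused from later in this thread are my assumptions; some videos only return subtitle_url when a logged-in SESSDATA cookie is sent):

import re
import requests

headers = {"User-Agent": "Mozilla/5.0"}  # use a full browser UA string in practice
# cookies = {"SESSDATA": "..."}  # add this if the subtitle list comes back empty

html = requests.get("https://www.bilibili.com/video/BV1xw4m1k7Jz/", headers=headers).text
aid = re.search(r'"aid":(\d+)', html).group(1)
cid = re.search(r'"cid":(\d+)', html).group(1)

info = requests.get(
    "https://api.bilibili.com/x/player/wbi/v2",
    params={"aid": aid, "cid": cid},
    headers=headers,
).json()

# The subtitle entries are assumed to sit under data.subtitle.subtitles
for sub in info["data"]["subtitle"]["subtitles"]:
    print(sub.get("lan"), sub["subtitle_url"])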

3 likes

Earlier a netizen helped me modify the GPT-4-generated code so it could automatically read YouTube video subtitles and save them to Notion, and later he also got automatic Bilibili subtitle extraction to Notion working, but he never gave me that code. My main problem now is that I don't know which Python library to use and have no idea where to start; there are so many APIs collected here that I can't tell which one to pick: GitHub - SocialSisterYi/bilibili-API-collect: 哔哩哔哩-API收集整理【不断更新中....】

2 likes

Could you post the Python code?

2 likes

Step 1: search the web for usable code, for example:

The one I found was JS code, so I had Claude convert it to Python, and then discovered it requires login.

The fix is simple: just pass in cookies. I then had Claude generate a JS snippet to run in the browser console to grab bili_jct and DedeUserID:

function getBilibiliCookies() {
  const cookies = document.cookie.split(';');
  const requiredCookies = ['SESSDATA', 'bili_jct', 'DedeUserID'];
  const result = {};

  for (let cookie of cookies) {
    const [name, value] = cookie.trim().split('=');
    if (requiredCookies.includes(name)) {
      result[name] = value;
    }
  }

  if (Object.keys(result).length !== requiredCookies.length) {
    console.log('警告:未找到所有必需的cookie。请确保您已登录B站。');
  }

  console.log('请将以下值复制到您的Python代码中:');
  console.log(JSON.stringify(result, null, 2));
  
  return result;
}

// Call the function
getBilibiliCookies();

Then fill your bili_jct and DedeUserID into the Python code below:

import json
import os
import re

import requests


class BilibiliSubtitleDownloader:
    def __init__(self, bili_jct, dede_user_id):
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            }
        )
        self.session.cookies.update({"bili_jct": bili_jct, "DedeUserID": dede_user_id})

    def get_video_info(self, url):
        response = self.session.get(url)
        html = response.text

        aid_match = re.search(r'"aid":(\d+)', html)
        bvid_match = re.search(r'"bvid":"([^"]+)"', html)
        oid_match = re.search(r'"cid":(\d+)', html)

        if not (aid_match or bvid_match) or not oid_match:
            raise ValueError("无法从页面提取aid/bvid和oid")

        aid = aid_match.group(1) if aid_match else None
        bvid = bvid_match.group(1) if bvid_match else None
        oid = oid_match.group(1)

        return aid, bvid, oid

    def get_subtitle_url(self, aid, bvid, oid):
        api_url = f"https://api.bilibili.com/x/v2/dm/view?{'aid=' + aid if aid else 'bvid=' + bvid}&oid={oid}&type=1"
        print(f"请求的API地址为: {api_url}")
        response = self.session.get(api_url)
        data = response.json()
        # print("API返回的数据为:")
        # print(json.dumps(data, ensure_ascii=False, indent=2))

        if data["code"] != 0:
            if data["code"] == -404:
                raise ValueError("无法读取本视频APP字幕配置")
            raise ValueError(f"API请求失败: {data['message']}")

        subtitle_data = data["data"]["subtitle"]
        if not subtitle_data or not subtitle_data["subtitles"]:
            raise ValueError("该视频没有可用的字幕")

        subtitle_list = subtitle_data["subtitles"]
        subtitle_url = subtitle_list[0]["subtitle_url"]

        return subtitle_url

    def download_subtitle(self, subtitle_url, output_file):
        print(f"请求的subtitle_url为: {subtitle_url}")
        response = self.session.get(subtitle_url)
        subtitle_data = response.json()

        print("字幕数据:")
        # print(json.dumps(subtitle_data, ensure_ascii=False, indent=2))

        with open(output_file, "w", encoding="utf-8") as f:
            for line in subtitle_data["body"]:
                start_time = self.format_time(line["from"])
                end_time = self.format_time(line["to"])
                content = line["content"]
                f.write(f"{start_time} --> {end_time}\n{content}\n\n")

        print(f"字幕已保存到 {output_file}")

    @staticmethod
    def format_time(seconds):
        m, s = divmod(seconds, 60)
        h, m = divmod(m, 60)
        return f"{int(h):02d}:{int(m):02d}:{s:06.3f}"


def main():
    # Set the cookies; both are strings
    bili_jct = "您的bili_jct值"
    dede_user_id = "您的DedeUserID值"
    video_url = "https://www.bilibili.com/video/BV1xw4m1k7Jz/?spm_id_from=333.788.recommend_more_video.0&vd_source=a2a26231f149eff4f1e0d2974449af5c"

    if bili_jct == "您的bili_jct值" or dede_user_id == "您的DedeUserID值":
        print("警告:请在main函数中设置正确的bili_jct和DedeUserID值")
        return

    downloader = BilibiliSubtitleDownloader(bili_jct, dede_user_id)

    try:
        aid, bvid, oid = downloader.get_video_info(video_url)
        print(f"当前视频的aid为{aid}, bvid为{bvid}, oid为{oid}")
        subtitle_url = downloader.get_subtitle_url(aid, bvid, oid)

        output_file = f"subtitle_{aid or bvid}_{oid}.srt"
        downloader.download_subtitle(subtitle_url, output_file)
    except Exception as e:
        print(f"下载失败: {str(e)}")


if __name__ == "__main__":
    main()
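To finish the Notion half of the original question, the .srt file this script writes can be appended to a Notion page with the official REST API, using the same chunk-into-paragraph-blocks pattern as the YouTube-to-Notion script shared further down the thread. A minimal sketch; the helper name, file name, page ID, and token are placeholders:

import json
import requests

def append_text_to_notion(page_id, text, notion_token):
    # Notion caps a single rich_text item at 2000 characters, so split into chunks
    chunks = [text[i:i + 2000] for i in range(0, len(text), 2000)]
    children = [
        {
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [{"type": "text", "text": {"content": chunk}}]},
        }
        for chunk in chunks
    ]
    response = requests.patch(
        f"https://api.notion.com/v1/blocks/{page_id}/children",
        headers={
            "Authorization": f"Bearer {notion_token}",
            "Content-Type": "application/json",
            "Notion-Version": "2022-06-28",
        },
        data=json.dumps({"children": children}),
    )
    return response.status_code == 200

# Example usage with placeholder values
with open("subtitle_xxx.srt", "r", encoding="utf-8") as f:
    ok = append_text_to_notion("your-notion-page-id", f.read(), "your-notion-token")
    print("Saved to Notion" if ok else "Notion request failed")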
3 likes

Bookmarking this for later; when I get time I'll export my tens of thousands of private messages.

2 likes

Good grief, tens of thousands of private messages.

1 like

I hacked one together in Python; it doesn't include the Notion part, only the simple subtitle-grabbing part.

1 like

Could you share the code so I can learn from it?

The code is in that post; expand the Python disclosure arrow and you'll see it.

1 like

# A script a forum member helped write that saves YouTube video subtitles to Notion;
# I want to adapt it so it can automatically read Bilibili subtitles and save them to Notion.
import tkinter as tk
from tkinter import messagebox
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
import requests
import json

def get_youtube_subtitles(video_url):
    try:
        video_id = video_url.split("v=")[1]
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
        subtitles = " ".join([entry['text'] for entry in transcript_list])
        with open('sub.txt', 'w', encoding='utf-8') as file:
            file.write(subtitles)
        return subtitles, "Subtitles retrieved successfully"
    except (IndexError, KeyError):
        return None, "Invalid YouTube URL."
    except NoTranscriptFound:
        return None, "No subtitles found for this video."
    except TranscriptsDisabled:
        return None, "Subtitles are disabled for this video."
    except Exception as e:
        return None, f"Error retrieving subtitles: {str(e)}"

def clean_subtitles(srt_captions):
    lines = srt_captions.splitlines()
    cleaned_text = []
    for line in lines:
        if not line.isdigit() and '-->' not in line:
            cleaned_text.append(line)
    return " ".join(cleaned_text)

def save_to_notion(page_id, text, notion_token):
    url = f"https://api.notion.com/v1/blocks/{page_id}/children"
    headers = {
        "Authorization": f"Bearer {notion_token}",
        "Content-Type": "application/json",
        "Notion-Version": "2022-06-28"
    }

    # Split text into chunks of 2000 characters or less
    chunks = [text[i:i+2000] for i in range(0, len(text), 2000)]

    children = []
    for chunk in chunks:
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {
                "rich_text": [{"type": "text", "text": {"content": chunk}}]
            }
        })

    data = {"children": children}

    response = requests.patch(url, headers=headers, data=json.dumps(data))

    if response.status_code == 200:
        return response, "Text saved to Notion successfully."
    else:
        return response, f"Failed to save text to Notion: {response.status_code} - {response.text}"

def extract_and_save():
    video_url = url_entry.get()
    page_id = page_id_entry.get()
    notion_token = token_entry.get()

    if not video_url or not page_id or not notion_token:
        messagebox.showerror("Error", "Please fill in all fields.")
        return

    srt_captions, message = get_youtube_subtitles(video_url)
    if srt_captions:
        cleaned_text = clean_subtitles(srt_captions)
        response, error_message = save_to_notion(page_id, cleaned_text, notion_token)
        result_label.config(text=error_message)
    else:
        result_label.config(text=message)

# Create the main window
root = tk.Tk()
root.title("YouTube Subtitle Extractor")

# URL input
tk.Label(root, text="YouTube URL:").grid(row=0, column=0, padx=10, pady=5, sticky=tk.E)
url_entry = tk.Entry(root, width=50)
url_entry.grid(row=0, column=1, padx=10, pady=5)

# Notion page ID input
tk.Label(root, text="Notion Page ID:").grid(row=1, column=0, padx=10, pady=5, sticky=tk.E)
page_id_entry = tk.Entry(root, width=50)
page_id_entry.grid(row=1, column=1, padx=10, pady=5)

# Notion integration token input
tk.Label(root, text="Notion Token:").grid(row=2, column=0, padx=10, pady=5, sticky=tk.E)
token_entry = tk.Entry(root, width=50, show="*")
token_entry.grid(row=2, column=1, padx=10, pady=5)

# Extract & Save button
extract_save_button = tk.Button(root, text="Extract & Save", command=extract_and_save)
extract_save_button.grid(row=3, column=0, columnspan=2, pady=10)

# Result label
result_label = tk.Label(root, text="", wraplength=400)
result_label.grid(row=4, column=0, columnspan=2, pady=5)

# Start the main loop
root.mainloop()
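To point this GUI at Bilibili instead, only the subtitle-fetching function needs to change. Below is a rough drop-in get_bilibili_subtitles sketch combining the page-scrape-plus-API approach from the earlier replies; the regexes, the data.subtitle.subtitles field path, and the optional SESSDATA cookie are assumptions to verify against the real JSON:

import re
import requests

def get_bilibili_subtitles(video_url, sessdata=""):
    headers = {"User-Agent": "Mozilla/5.0"}
    cookies = {"SESSDATA": sessdata} if sessdata else {}
    try:
        # Scrape aid and cid out of the video page HTML
        html = requests.get(video_url, headers=headers, cookies=cookies).text
        aid = re.search(r'"aid":(\d+)', html).group(1)
        cid = re.search(r'"cid":(\d+)', html).group(1)

        # Ask the player API for the subtitle list
        info = requests.get(
            "https://api.bilibili.com/x/player/wbi/v2",
            params={"aid": aid, "cid": cid},
            headers=headers,
            cookies=cookies,
        ).json()
        subtitles = info["data"]["subtitle"]["subtitles"]
        if not subtitles:
            return None, "No subtitles found for this video."

        subtitle_url = subtitles[0]["subtitle_url"]
        if subtitle_url.startswith("//"):  # the URL is often protocol-relative
            subtitle_url = "https:" + subtitle_url
        body = requests.get(subtitle_url, headers=headers).json()["body"]
        text = " ".join(line["content"] for line in body)
        return text, "Subtitles retrieved successfully"
    except Exception as e:
        return None, f"Error retrieving subtitles: {str(e)}"

Since this already returns plain text rather than SRT, extract_and_save could call it in place of get_youtube_subtitles and skip clean_subtitles entirely.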

Is there a cookie-free way to do this? After all, I only have one account.

The YouTube one doesn't need cookies.

1 like

Bumping this, hoping a tech expert will weigh in.
This newbie wants to learn!

Going forward I'll keep using this thread to analyze Python programs for extracting key video subtitles; please follow along.

Oh, I was asking whether there's a way to download Bilibili subtitles without cookies.

Not at the moment, but there probably will be eventually.

Waiting for a mod to move this to the AI section.
Some of the experts may not read the general topics!
After all, the AI section is the forum's hottest board.

# Import the HTTP request module
import requests
# Import the regular expression module
import re
# Import the json module
import json
# Import the pretty-print module
from pprint import pprint
import subprocess
# Import the hashing module
import hashlib
# Import the time module
import time


def GetSign(page, date_time):
    f = [
        "keyword=",
        "mid=3493110839511225",
        "order=pubdate",
        "order_avoided=true",
        "platform=web",
        f"pn={page}",
        "ps=30",
        "tid=0",
        "web_location=1550101",
        f"wts={date_time}"
    ]
    # Join the parameters into one string
    y = '&'.join(f)
    # Append the salt and MD5-hash it to get the w_rid signature
    string = y + 'ea1db124af3c7062474693fa704f4ff8'
    MD5 = hashlib.md5()
    MD5.update(string.encode('utf-8'))
    w_rid = MD5.hexdigest()
    return w_rid


"""Send the request"""
# Impersonate a browser
headers = {
    # Cookie: user info, commonly used to check whether an account is logged in
    # Referer: anti-hotlink header, here the uploader's space video page the request would come from
    "Referer": "https://space.bilibili.com/3493110839511225/video",
    # User-Agent: basic browser/device identity
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
}
for page in range(2, 21):
    print(f'正在采集第{page}页数据内容')
    # Request URL
    link = 'https://api.bilibili.com/x/space/wbi/arc/search'
    # Current timestamp
    date_time = int(time.time())
    """Get the signed parameter"""
    w_rid = GetSign(page, date_time)
    # Request parameters
    data = {
        'mid': '3493110839511225',
        'ps': '30',
        'tid': '0',
        'pn': page,
        'keyword': '',
        'order': 'pubdate',
        'platform': 'web',
        'web_location': '1550101',
        'order_avoided': 'true',
        'w_rid': w_rid,
        'wts': date_time,
    }
    # Send the request and parse the JSON response
    link_json = requests.get(url=link, params=data, headers=headers).json()
    # Loop over the list and pull out every video's BV id
    for index in link_json['data']['list']['vlist']:
        bv_id = index['bvid']
        # Request URL
        url = f'https://www.bilibili.com/video/{bv_id}/?spm_id_from=333.999.0.0'
        # Send the request
        response = requests.get(url=url, headers=headers)
        """Get the data"""
        # Response text (page source)
        html = response.text
        """Parse the data"""
        # Extract the title
        title = re.findall('<h1 data-title="(.*?)" title="', html)[0]
        # Extract the play info
        info = re.findall('window.__playinfo__=(.*?)</script', html)[0]
        # Convert the JSON string into a dict
        json_data = json.loads(info)
        # Extract the audio URL
        audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
        # Extract the video URL
        video_url = json_data['data']['dash']['video'][0]['baseUrl']
        print(title)
        print(audio_url)
        print(video_url)
        """Save the data"""
        # Download the audio content
        audio_content = requests.get(url=audio_url, headers=headers).content
        # Download the video content
        video_content = requests.get(url=video_url, headers=headers).content
        # Save the audio
        with open('video\\' + title + '.mp3', mode='wb') as audio:
            # Write the data
            audio.write(audio_content)
        # Save the video
        with open('video\\' + title + '.mp4', mode='wb') as video:
            # Write the data
            video.write(video_content)
        """Merge the audio and video"""
        # Build the ffmpeg command
        cmd = f"ffmpeg -hide_banner -i video\\{title}.mp4 -i video\\{title}.mp3 -c:v copy -c:a aac -strict experimental data\\{title}output.mp4"
        subprocess.run(cmd)
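If what you actually want for each upload is the subtitle rather than the audio/video streams, the same per-page listing can feed the subtitle downloader posted earlier in this thread. A rough sketch for a single page, reusing GetSign and headers from the script above and assuming the BilibiliSubtitleDownloader class has been saved next to it as bilibili_subtitle_downloader.py (a hypothetical module name); valid cookies are still required:

import time
import requests
from bilibili_subtitle_downloader import BilibiliSubtitleDownloader  # hypothetical module

downloader = BilibiliSubtitleDownloader("your bili_jct", "your DedeUserID")

page = 1
date_time = int(time.time())
params = {
    'mid': '3493110839511225', 'ps': '30', 'tid': '0', 'pn': page, 'keyword': '',
    'order': 'pubdate', 'platform': 'web', 'web_location': '1550101',
    'order_avoided': 'true', 'w_rid': GetSign(page, date_time), 'wts': date_time,
}
vlist = requests.get('https://api.bilibili.com/x/space/wbi/arc/search',
                     params=params, headers=headers).json()['data']['list']['vlist']

for item in vlist:
    bv_id = item['bvid']
    try:
        aid, bvid, oid = downloader.get_video_info(f'https://www.bilibili.com/video/{bv_id}/')
        subtitle_url = downloader.get_subtitle_url(aid, bvid, oid)
        downloader.download_subtitle(subtitle_url, f'subtitle_{bv_id}.srt')
    except Exception as e:
        # Many uploads have no CC subtitles; just skip those
        print(f'{bv_id}: skipped ({e})')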