A forum member helped me write a script that saves YouTube video subtitles to Notion, and I'd like to adapt it so it can automatically read Bilibili subtitles and save them to Notion. How do I write a Python program that automatically reads Bilibili subtitles and saves them to Notion?

Which APIs are needed? The code GPT-4 wrote for me doesn't run. I found some code on Bilibili and hope we can study it together:
def extract_information(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as file:
        file_content = file.read()

    # Find every "utf8" field and strip the newline characters
    information_list = []
    start_index = file_content.find('"utf8": "')
    while start_index != -1:
        start_index += len('"utf8": "')
        end_index = file_content.find('"', start_index)
        information = file_content[start_index:end_index].replace("\\n", "").replace("\n", "")
        information_list.append(information)
        start_index = file_content.find('"utf8": "', end_index)

    # Write the cleaned text to the output file
    with open(output_file, 'w', encoding='utf-8') as file:
        for info in information_list:
            file.write(info)

    print("PASS")
    print("#####     #####        ######    ######   ")
    print("#    ##  ##   ##      ##        ##        ")
    print("##  ##  ##     ##    ##        ##         ")
    print("## ##   #########      ##       ##        ")
    print("##      ##     ##       ##        ##      ")
    print("##      ##     ##        ##         ##    ")
    print("##      ##     ##         ##      ##      ")
    print("##      ##     ##     ####     ###        ")

if __name__ == "__main__":
    input_file = "Youtube字幕文件.txt"    # input file path
    output_file = "output_file_字幕.txt"  # output file path
    extract_information(input_file, output_file)

10 likes

Waiting here for a method.

I only know how to extract it manually; see this: https://www.cnblogs.com/lwp-nicol/p/18276164

2 likes
https://api.bilibili.com/x/player/wbi/v2?aid=XXX&cid=XXX

Use this endpoint and look for subtitle_url in the response.
aid is the submission's av number, and cid is the ID of the individual video part.
You can get aid and cid simply by searching for them in the HTML returned by the video playback page, i.e. https://www.bilibili.com/video/BVxxxxx.

These two parameters are written into window.__playinfo__ when the page loads, so you can cut that segment out of the HTML and parse it as JSON.
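Put together as a minimal Python sketch (the regexes, the data.subtitle.subtitles field path, and the example URL reused from later in this thread are my assumptions; some videos only return subtitle_url when a logged-in SESSDATA cookie is sent):

import re
import requests

headers = {"User-Agent": "Mozilla/5.0"}  # use a full browser UA string in practice
# cookies = {"SESSDATA": "..."}  # add this if the subtitle list comes back empty

html = requests.get("https://www.bilibili.com/video/BV1xw4m1k7Jz/", headers=headers).text
aid = re.search(r'"aid":(\d+)', html).group(1)
cid = re.search(r'"cid":(\d+)', html).group(1)

info = requests.get(
    "https://api.bilibili.com/x/player/wbi/v2",
    params={"aid": aid, "cid": cid},
    headers=headers,
).json()

# The subtitle entries are assumed to sit under data.subtitle.subtitles
for sub in info["data"]["subtitle"]["subtitles"]:
    print(sub.get("lan"), sub["subtitle_url"])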

3 likes

Earlier a netizen helped me modify the GPT-4-generated code so it could automatically read YouTube video subtitles and save them to Notion, and later he also got automatic Bilibili subtitle extraction to Notion working, but he never gave me that code. My main problem now is that I don't know which Python library to use and have no idea where to start; there are so many APIs collected here that I can't tell which one to pick: GitHub - SocialSisterYi/bilibili-API-collect: 哔哩哔哩-API收集整理【不断更新中....】

2 likes

Could you post the Python code?

2 likes

Step 1: search the web for usable code, for example:

The one I found was JS code, so I had Claude convert it to Python, and then discovered it requires login.

The fix is simple: just pass in cookies. I then had Claude generate a JS snippet to run in the browser console to grab bili_jct and DedeUserID:

function getBilibiliCookies() {
  const cookies = document.cookie.split(';');
  const requiredCookies = ['SESSDATA', 'bili_jct', 'DedeUserID'];
  const result = {};

  for (let cookie of cookies) {
    const [name, value] = cookie.trim().split('=');
    if (requiredCookies.includes(name)) {
      result[name] = value;
    }
  }

  if (Object.keys(result).length !== requiredCookies.length) {
    console.log('警告:未找到所有必需的cookie。请确保您已登录B站。');
  }

  console.log('请将以下值复制到您的Python代码中:');
  console.log(JSON.stringify(result, null, 2));
  
  return result;
}

// Call the function
getBilibiliCookies();

Then fill your bili_jct and DedeUserID into the Python code below:

import json
import os
import re

import requests


class BilibiliSubtitleDownloader:
    def __init__(self, bili_jct, dede_user_id):
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            }
        )
        self.session.cookies.update({"bili_jct": bili_jct, "DedeUserID": dede_user_id})

    def get_video_info(self, url):
        response = self.session.get(url)
        html = response.text

        aid_match = re.search(r'"aid":(\d+)', html)
        bvid_match = re.search(r'"bvid":"([^"]+)"', html)
        oid_match = re.search(r'"cid":(\d+)', html)

        if not (aid_match or bvid_match) or not oid_match:
            raise ValueError("无法从页面提取aid/bvid和oid")

        aid = aid_match.group(1) if aid_match else None
        bvid = bvid_match.group(1) if bvid_match else None
        oid = oid_match.group(1)

        return aid, bvid, oid

    def get_subtitle_url(self, aid, bvid, oid):
        api_url = f"https://api.bilibili.com/x/v2/dm/view?{'aid=' + aid if aid else 'bvid=' + bvid}&oid={oid}&type=1"
        print(f"请求的API地址为: {api_url}")
        response = self.session.get(api_url)
        data = response.json()
        # print("API返回的数据为:")
        # print(json.dumps(data, ensure_ascii=False, indent=2))

        if data["code"] != 0:
            if data["code"] == -404:
                raise ValueError("无法读取本视频APP字幕配置")
            raise ValueError(f"API请求失败: {data['message']}")

        subtitle_data = data["data"]["subtitle"]
        if not subtitle_data or not subtitle_data["subtitles"]:
            raise ValueError("该视频没有可用的字幕")

        subtitle_list = subtitle_data["subtitles"]
        subtitle_url = subtitle_list[0]["subtitle_url"]

        return subtitle_url

    def download_subtitle(self, subtitle_url, output_file):
        print(f"请求的subtitle_url为: {subtitle_url}")
        response = self.session.get(subtitle_url)
        subtitle_data = response.json()

        print("字幕数据:")
        # print(json.dumps(subtitle_data, ensure_ascii=False, indent=2))

        with open(output_file, "w", encoding="utf-8") as f:
            for line in subtitle_data["body"]:
                start_time = self.format_time(line["from"])
                end_time = self.format_time(line["to"])
                content = line["content"]
                f.write(f"{start_time} --> {end_time}\n{content}\n\n")

        print(f"字幕已保存到 {output_file}")

    @staticmethod
    def format_time(seconds):
        m, s = divmod(seconds, 60)
        h, m = divmod(m, 60)
        return f"{int(h):02d}:{int(m):02d}:{s:06.3f}"


def main():
    # Set the cookies; both are strings
    bili_jct = "您的bili_jct值"
    dede_user_id = "您的DedeUserID值"
    video_url = "https://www.bilibili.com/video/BV1xw4m1k7Jz/?spm_id_from=333.788.recommend_more_video.0&vd_source=a2a26231f149eff4f1e0d2974449af5c"

    if bili_jct == "您的bili_jct值" or dede_user_id == "您的DedeUserID值":
        print("警告:请在main函数中设置正确的bili_jct和DedeUserID值")
        return

    downloader = BilibiliSubtitleDownloader(bili_jct, dede_user_id)

    try:
        aid, bvid, oid = downloader.get_video_info(video_url)
        print(f"当前视频的aid为{aid}, bvid为{bvid}, oid为{oid}")
        subtitle_url = downloader.get_subtitle_url(aid, bvid, oid)

        output_file = f"subtitle_{aid or bvid}_{oid}.srt"
        downloader.download_subtitle(subtitle_url, output_file)
    except Exception as e:
        print(f"下载失败: {str(e)}")


if __name__ == "__main__":
    main()
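To finish the Notion half of the original question, the .srt file this script writes can be appended to a Notion page with the official REST API, using the same chunk-into-paragraph-blocks pattern as the YouTube-to-Notion script shared further down the thread. A minimal sketch; the helper name, file name, page ID, and token are placeholders:

import json
import requests

def append_text_to_notion(page_id, text, notion_token):
    # Notion caps a single rich_text item at 2000 characters, so split into chunks
    chunks = [text[i:i + 2000] for i in range(0, len(text), 2000)]
    children = [
        {
            "object": "block",
            "type": "paragraph",
            "paragraph": {"rich_text": [{"type": "text", "text": {"content": chunk}}]},
        }
        for chunk in chunks
    ]
    response = requests.patch(
        f"https://api.notion.com/v1/blocks/{page_id}/children",
        headers={
            "Authorization": f"Bearer {notion_token}",
            "Content-Type": "application/json",
            "Notion-Version": "2022-06-28",
        },
        data=json.dumps({"children": children}),
    )
    return response.status_code == 200

# Example usage with placeholder values
with open("subtitle_xxx.srt", "r", encoding="utf-8") as f:
    ok = append_text_to_notion("your-notion-page-id", f.read(), "your-notion-token")
    print("Saved to Notion" if ok else "Notion request failed")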
3 likes

Bookmarking this for later; when I get time I'll export my tens of thousands of private messages.

2 likes

Good grief, tens of thousands of private messages.

1 like

I hacked one together in Python; it doesn't include the Notion part, only the simple subtitle-grabbing part.

1 like

Could you share the code so I can learn from it?

The code is in that post; expand the Python disclosure arrow and you'll see it.

1 like

# A script a forum member helped write that saves YouTube video subtitles to Notion;
# I want to adapt it so it can automatically read Bilibili subtitles and save them to Notion.
import tkinter as tk
from tkinter import messagebox
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
import requests
import json

def get_youtube_subtitles(video_url):
    try:
        video_id = video_url.split("v=")[1]
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
        subtitles = " ".join([entry['text'] for entry in transcript_list])
        with open('sub.txt', 'w', encoding='utf-8') as file:
            file.write(subtitles)
        return subtitles, "Subtitles retrieved successfully"
    except (IndexError, KeyError):
        return None, "Invalid YouTube URL."
    except NoTranscriptFound:
        return None, "No subtitles found for this video."
    except TranscriptsDisabled:
        return None, "Subtitles are disabled for this video."
    except Exception as e:
        return None, f"Error retrieving subtitles: {str(e)}"

def clean_subtitles(srt_captions):
    lines = srt_captions.splitlines()
    cleaned_text = []
    for line in lines:
        if not line.isdigit() and '-->' not in line:
            cleaned_text.append(line)
    return " ".join(cleaned_text)

def save_to_notion(page_id, text, notion_token):
    url = f"https://api.notion.com/v1/blocks/{page_id}/children"
    headers = {
        "Authorization": f"Bearer {notion_token}",
        "Content-Type": "application/json",
        "Notion-Version": "2022-06-28"
    }

    # Split text into chunks of 2000 characters or less
    chunks = [text[i:i+2000] for i in range(0, len(text), 2000)]

    children = []
    for chunk in chunks:
        children.append({
            "object": "block",
            "type": "paragraph",
            "paragraph": {
                "rich_text": [{"type": "text", "text": {"content": chunk}}]
            }
        })

    data = {"children": children}

    response = requests.patch(url, headers=headers, data=json.dumps(data))

    if response.status_code == 200:
        return response, "Text saved to Notion successfully."
    else:
        return response, f"Failed to save text to Notion: {response.status_code} - {response.text}"

def extract_and_save():
    video_url = url_entry.get()
    page_id = page_id_entry.get()
    notion_token = token_entry.get()

    if not video_url or not page_id or not notion_token:
        messagebox.showerror("Error", "Please fill in all fields.")
        return

    srt_captions, message = get_youtube_subtitles(video_url)
    if srt_captions:
        cleaned_text = clean_subtitles(srt_captions)
        response, error_message = save_to_notion(page_id, cleaned_text, notion_token)
        result_label.config(text=error_message)
    else:
        result_label.config(text=message)

# Create the main window
root = tk.Tk()
root.title("YouTube Subtitle Extractor")

# URL input
tk.Label(root, text="YouTube URL:").grid(row=0, column=0, padx=10, pady=5, sticky=tk.E)
url_entry = tk.Entry(root, width=50)
url_entry.grid(row=0, column=1, padx=10, pady=5)

# Notion page ID input
tk.Label(root, text="Notion Page ID:").grid(row=1, column=0, padx=10, pady=5, sticky=tk.E)
page_id_entry = tk.Entry(root, width=50)
page_id_entry.grid(row=1, column=1, padx=10, pady=5)

# Notion integration token input
tk.Label(root, text="Notion Token:").grid(row=2, column=0, padx=10, pady=5, sticky=tk.E)
token_entry = tk.Entry(root, width=50, show="*")
token_entry.grid(row=2, column=1, padx=10, pady=5)

# Extract & Save button
extract_save_button = tk.Button(root, text="Extract & Save", command=extract_and_save)
extract_save_button.grid(row=3, column=0, columnspan=2, pady=10)

# Result label
result_label = tk.Label(root, text="", wraplength=400)
result_label.grid(row=4, column=0, columnspan=2, pady=5)

# Start the main loop
root.mainloop()
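To point this GUI at Bilibili instead, only the subtitle-fetching function needs to change. Below is a rough drop-in get_bilibili_subtitles sketch combining the page-scrape-plus-API approach from the earlier replies; the regexes, the data.subtitle.subtitles field path, and the optional SESSDATA cookie are assumptions to verify against the real JSON:

import re
import requests

def get_bilibili_subtitles(video_url, sessdata=""):
    headers = {"User-Agent": "Mozilla/5.0"}
    cookies = {"SESSDATA": sessdata} if sessdata else {}
    try:
        # Scrape aid and cid out of the video page HTML
        html = requests.get(video_url, headers=headers, cookies=cookies).text
        aid = re.search(r'"aid":(\d+)', html).group(1)
        cid = re.search(r'"cid":(\d+)', html).group(1)

        # Ask the player API for the subtitle list
        info = requests.get(
            "https://api.bilibili.com/x/player/wbi/v2",
            params={"aid": aid, "cid": cid},
            headers=headers,
            cookies=cookies,
        ).json()
        subtitles = info["data"]["subtitle"]["subtitles"]
        if not subtitles:
            return None, "No subtitles found for this video."

        subtitle_url = subtitles[0]["subtitle_url"]
        if subtitle_url.startswith("//"):  # the URL is often protocol-relative
            subtitle_url = "https:" + subtitle_url
        body = requests.get(subtitle_url, headers=headers).json()["body"]
        text = " ".join(line["content"] for line in body)
        return text, "Subtitles retrieved successfully"
    except Exception as e:
        return None, f"Error retrieving subtitles: {str(e)}"

Since this already returns plain text rather than SRT, extract_and_save could call it in place of get_youtube_subtitles and skip clean_subtitles entirely.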

Is there a cookie-free way to do this? After all, I only have one account.

The YouTube one doesn't need cookies.

1 like

Bumping this, hoping a tech expert will weigh in.
This newbie wants to learn!

Going forward I'll keep using this thread to analyze Python programs for extracting key video subtitles; please follow along.

Oh, I was asking whether there's a way to download Bilibili subtitles without cookies.

Not at the moment, but there probably will be eventually.

Waiting for a mod to move this to the AI section.
Some of the experts may not read the general topics!
After all, the AI section is the forum's hottest board.

# Import the HTTP request module
import requests
# Import the regular expression module
import re
# Import the json module
import json
# Import the pretty-print module
from pprint import pprint
import subprocess
# Import the hashing module
import hashlib
# Import the time module
import time


def GetSign(page, date_time):
    f = [
        "keyword=",
        "mid=3493110839511225",
        "order=pubdate",
        "order_avoided=true",
        "platform=web",
        f"pn={page}",
        "ps=30",
        "tid=0",
        "web_location=1550101",
        f"wts={date_time}"
    ]
    # Join the parameters into one string
    y = '&'.join(f)
    # Append the salt and MD5-hash it to get the w_rid signature
    string = y + 'ea1db124af3c7062474693fa704f4ff8'
    MD5 = hashlib.md5()
    MD5.update(string.encode('utf-8'))
    w_rid = MD5.hexdigest()
    return w_rid


"""Send the request"""
# Impersonate a browser
headers = {
    # Cookie: user info, commonly used to check whether an account is logged in
    # Referer: anti-hotlink header, here the uploader's space video page the request would come from
    "Referer": "https://space.bilibili.com/3493110839511225/video",
    # User-Agent: basic browser/device identity
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
}
for page in range(2, 21):
    print(f'正在采集第{page}页数据内容')
    # Request URL
    link = 'https://api.bilibili.com/x/space/wbi/arc/search'
    # Current timestamp
    date_time = int(time.time())
    """Get the signed parameter"""
    w_rid = GetSign(page, date_time)
    # Request parameters
    data = {
        'mid': '3493110839511225',
        'ps': '30',
        'tid': '0',
        'pn': page,
        'keyword': '',
        'order': 'pubdate',
        'platform': 'web',
        'web_location': '1550101',
        'order_avoided': 'true',
        'w_rid': w_rid,
        'wts': date_time,
    }
    # Send the request and parse the JSON response
    link_json = requests.get(url=link, params=data, headers=headers).json()
    # Loop over the list and pull out every video's BV id
    for index in link_json['data']['list']['vlist']:
        bv_id = index['bvid']
        # Request URL
        url = f'https://www.bilibili.com/video/{bv_id}/?spm_id_from=333.999.0.0'
        # Send the request
        response = requests.get(url=url, headers=headers)
        """Get the data"""
        # Response text (page source)
        html = response.text
        """Parse the data"""
        # Extract the title
        title = re.findall('<h1 data-title="(.*?)" title="', html)[0]
        # Extract the play info
        info = re.findall('window.__playinfo__=(.*?)</script', html)[0]
        # Convert the JSON string into a dict
        json_data = json.loads(info)
        # Extract the audio URL
        audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
        # Extract the video URL
        video_url = json_data['data']['dash']['video'][0]['baseUrl']
        print(title)
        print(audio_url)
        print(video_url)
        """Save the data"""
        # Download the audio content
        audio_content = requests.get(url=audio_url, headers=headers).content
        # Download the video content
        video_content = requests.get(url=video_url, headers=headers).content
        # Save the audio
        with open('video\\' + title + '.mp3', mode='wb') as audio:
            # Write the data
            audio.write(audio_content)
        # Save the video
        with open('video\\' + title + '.mp4', mode='wb') as video:
            # Write the data
            video.write(video_content)
        """Merge the audio and video"""
        # Build the ffmpeg command
        cmd = f"ffmpeg -hide_banner -i video\\{title}.mp4 -i video\\{title}.mp3 -c:v copy -c:a aac -strict experimental data\\{title}output.mp4"
        subprocess.run(cmd)
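If what you actually want for each upload is the subtitle rather than the audio/video streams, the same per-page listing can feed the subtitle downloader posted earlier in this thread. A rough sketch for a single page, reusing GetSign and headers from the script above and assuming the BilibiliSubtitleDownloader class has been saved next to it as bilibili_subtitle_downloader.py (a hypothetical module name); valid cookies are still required:

import time
import requests
from bilibili_subtitle_downloader import BilibiliSubtitleDownloader  # hypothetical module

downloader = BilibiliSubtitleDownloader("your bili_jct", "your DedeUserID")

page = 1
date_time = int(time.time())
params = {
    'mid': '3493110839511225', 'ps': '30', 'tid': '0', 'pn': page, 'keyword': '',
    'order': 'pubdate', 'platform': 'web', 'web_location': '1550101',
    'order_avoided': 'true', 'w_rid': GetSign(page, date_time), 'wts': date_time,
}
vlist = requests.get('https://api.bilibili.com/x/space/wbi/arc/search',
                     params=params, headers=headers).json()['data']['list']['vlist']

for item in vlist:
    bv_id = item['bvid']
    try:
        aid, bvid, oid = downloader.get_video_info(f'https://www.bilibili.com/video/{bv_id}/')
        subtitle_url = downloader.get_subtitle_url(aid, bvid, oid)
        downloader.download_subtitle(subtitle_url, f'subtitle_{bv_id}.srt')
    except Exception as e:
        # Many uploads have no CC subtitles; just skip those
        print(f'{bv_id}: skipped ({e})')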