��ȡBվ��ۣ�Python��ʵ��

2024-04-30 41

��Ȩ

��Ȩ��

��ɰ��ʵ��ע��û��Է��ף��Ȩ��ԭ��У��ƿ��ӵ��Ȩ��಻�е��Ӧ��Ρ��鿴�� ƿ��û��Э�� ƿ��֪ʶ��Ȩ��ָ��ֱ��ӳ�Ϯ��ݣ��д ��ȨͶ�߱��оٱ��һ��ʵ��ɾ��Ȩ��ݡ�

��漰�Ĳ�Ʒ

ʵʱ�� Flink �棬5000CU*H 3��

�� Elasticsearch �棬2��4GB��߹�� 1��

��ݿ��ƽ̨ DataWorks��ʱ��

��飺 ��ȡBվ��ۣ�Python��ʵ��

��
�ڵ��Ϣ��ը�Ļ��ʱ��û��ɵ��ݲ��ӿ�֣��а��˸��ָ��ۡ��Bվ��Ϊһ��Ƶ��ƽ̨��һ��˸��ָ��۵ı��صء��ô��û��һ�ּ򵥵ķ��Խ��Щ��ռ��أ��ǿ϶��ģ��Ľ��ʹ��Python��дһ��ʵ��ȡBվ��Ƶ��ۣ�Ϊ��̽��ݵİ��ش��
ʲô��棿
�ڿ�ʼ֮ǰ��˽�һ��ʲô��档��棬�ֳ��桢��֩�룬��һ�ְ��һ��Ĺ��Զ��ػ�ȡ��ά��Ϣ�ĳ��ű��˵��ͨ��д��룬�ü��Զ��ش��ҳ��ץȡ��Ҫ��Ϣ��Python��Ϊһ�ּ�ࡢ��ѧ�ı��ԣ��ǳ��ʺ��д��
׼��
�ڿ�ʼ��ȡBվ��֮ǰ��Ҫ��һЩ׼��

Python��ȷ��ĵ��Ѿ��װ��Python��ܹ��С�
�༭��Ƽ�ʹ��VS Code��PyCharm�ȱ༭��дPython��룬��Ժ͹��
��⣺��ǽ�ʹ��requests�ⷢ��HTTP��Լ�beautifulsoup4��HTMLҳ�档��ʹ��װ��⣺
��д��
��һ��ȡ��ҳ��URL
��ȣ��Ҫ�ҵ�Ҫ��ȡ��۵��Ƶҳ�棬��ȡ��ҳ��URL��ͨ��Bվ��Ƶ��ҳ��URL��ʽΪhttps://www.bilibili.com/video/avXXXXXX/#reply��avXXXXXX��Ƶ��av�š��ǿ��ͨ��ƴ��URL�ķ�ʽ��ҳ��URL��
�ڶ��HTTP��ȡҳ��
��ҳ��URL֮��ǾͿ��ʹ��requests�ⷢ��HTTP��󣬻�ȡҳ��HTML��ݡ�
��ʵ��
```python
import requests
import json
import os
import pickle
from bs4 import BeautifulSoup
import time

# HTTP request headers sent with every request; the User-Agent makes the
# script present itself as a desktop browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
}

# Proxy settings. Each value may be overridden through an environment
# variable so that credentials do not have to live in the source file; the
# defaults are the values that were previously hard-coded.
proxyHost = os.environ.get("BILI_PROXY_HOST", "www.16yun.cn")
proxyPort = os.environ.get("BILI_PROXY_PORT", "5445")
proxyUser = os.environ.get("BILI_PROXY_USER", "16QMSOML")
proxyPass = os.environ.get("BILI_PROXY_PASS", "280651")
# Full proxy URL in the user:password@host:port form.
proxyMeta = f"http://{proxyUser}:{proxyPass}@{proxyHost}:{proxyPort}"

# Pickle file holding the cookies that are sent with every request.
cookies_file = 'cookies.pkl'

# File name reserved for saving progress.
# NOTE(review): nothing else in the visible code reads or writes this file.
progress_file = 'progress.txt'

# Directory in which the per-video comment CSV files are stored.
comment_dir = 'comments'

# Create the comment output directory; exist_ok avoids the race between
# checking for the directory and creating it.
os.makedirs(comment_dir, exist_ok=True)

# Load previously saved cookies; ``None`` means no usable cookie file exists.
# NOTE: unpickling can execute arbitrary code, so only load a cookie file
# that was produced on this machine by a trusted tool.
cookies = None
if os.path.exists(cookies_file):
    try:
        with open(cookies_file, 'rb') as f:
            cookies = pickle.load(f)
    except (pickle.UnpicklingError, EOFError):
        # A truncated or corrupt file is treated the same as a missing one.
        cookies = None

def login():
    """Prompt the user to log in to Bilibili manually.

    This function only prints a message; it performs no login and does not
    create the cookie file itself.
    """
    # NOTE(review): the message text is mis-encoded in this copy of the
    # file; restore the original wording from an intact source.
    print("��ֶ��¼Bվ��cookies��cookies.pkl�ļ��С�")

def get_video_id(url):
    """Return the video ID, i.e. the last non-empty path segment of *url*.

    Query strings, fragments (such as ``#reply``) and trailing slashes are
    ignored, so ``https://www.bilibili.com/video/av123/#reply`` yields
    ``av123`` instead of ``#reply``.

    Args:
        url: Address of the video page.

    Returns:
        The last non-empty path segment, or an empty string when the URL has
        no path segments.
    """
    from urllib.parse import urlsplit

    segments = [part for part in urlsplit(url).path.split('/') if part]
    return segments[-1] if segments else ''

def get_comments(video_url):
    """Download every comment of one video and save them to a CSV file.

    The output file is ``<comment_dir>/<video id>.csv``. When that file
    already exists the video is skipped. Otherwise the video page is fetched,
    the comment API URL is read from its ``application/ld+json`` script, and
    the API is paged through until a page without replies is returned.

    Args:
        video_url: Address of the video page.

    Returns:
        None. The comments are written to disk.

    Raises:
        ValueError: If the video page contains no ``application/ld+json``
            script from which the comment API URL could be read.
    """
    import csv

    video_id = get_video_id(video_url)
    comment_file = os.path.join(comment_dir, f'{video_id}.csv')
    if os.path.exists(comment_file):
        print(f"��ļ� {comment_file} �Ѵ��ڣ��Ƶ��")
        return

    proxies = {"http": proxyMeta, "https": proxyMeta}
    # Without a timeout a stalled proxy would block the whole run forever.
    request_timeout = 30

    # Request the video page to discover the comment API URL.
    response = requests.get(video_url, headers=headers, cookies=cookies,
                            proxies=proxies, timeout=request_timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    script = soup.find('script', attrs={'type': 'application/ld+json'})
    if script is None:
        raise ValueError(
            f"no application/ld+json script found on {video_url}")
    video_data = json.loads(script.text)
    # NOTE(review): assumes the embedded JSON exposes comment.embedUrl —
    # confirm against a live page.
    api_url = video_data['comment']['embedUrl']

    # Fetch page after page until the API returns no more replies.
    page = 1
    comments = []
    while True:
        api = f'{api_url}&pn={page}&type=1'
        response = requests.get(api, headers=headers, cookies=cookies,
                                proxies=proxies, timeout=request_timeout)
        data = response.json()
        # 'data' or 'replies' may be missing or null on the last page.
        replies = (data.get('data') or {}).get('replies')
        if not replies:
            break
        comments.extend(replies)
        page += 1
        time.sleep(1)  # throttle so the IP is not banned for hammering the API

    # Save the comments to the CSV file. csv.writer quotes fields that
    # contain commas, quotes or line breaks, which the previous hand-built
    # rows did not.
    with open(comment_file, 'w', encoding='utf-8', newline='') as f:
        f.write('һ�����ۼ���,������ϵ,���������ǳ�,��������ID,�������ǳ�,�������û�ID,��������,����ʱ��,������\n')
        writer = csv.writer(f, lineterminator='\n')
        for comment in comments:
            content = comment['content']['message'].replace('\n', ' ')
            writer.writerow(['1', ' ', ' ', ' ', ' ', ' ',
                             content, comment['ctime'], comment['like']])
            # 'replies' can be present but null, hence the "or []".
            for reply in comment.get('replies') or []:
                content = reply['content']['message'].replace('\n', ' ')
                member = reply['member']
                # NOTE(review): columns 3-4 and 5-6 both carry the reply
                # author's name and ID; presumably one pair was meant to
                # identify the parent comment's author — confirm.
                writer.writerow(['2', comment['mid'],
                                 member['uname'], member['mid'],
                                 member['uname'], member['mid'],
                                 content, reply['ctime'], reply['like']])
    print(f"�ɹ���ȡ��Ƶ {video_id} �����ۣ������� {comment_file} �С�")

def main():
    """Scrape the comments of every video listed in ``video_list.txt``.

    The file is read line by line; each non-empty line is treated as one
    video URL and passed to ``get_comments``.
    """
    # Read the video list.
    with open('video_list.txt', 'r') as f:
        video_urls = [line.strip() for line in f]

    # Scrape each video in turn.
    for url in video_urls:
        if not url:
            # Skip blank lines instead of requesting an empty URL.
            continue
        get_comments(url)

# Script entry point: remind the user to log in when no cookies were loaded,
# then process the video list.
if __name__ == '__main__':
    if cookies is None:
        login()
    main()
```
�ܽ�
�� ȡ��Ƶ��ۣ�ֻ�轫Ҫ��ȡ��Ƶ��ַд��video_list.txt�ļ��У��Զ��ַ�б��ȡÿ��Ƶ��ۣ��浽��ƵID��CSV�ļ��С�
�� ֻ��һ�ε�¼��ֶ��¼Bվһ�κ󣬳��Զ��cookies��´��г��ʱ��ٴε�¼��ȷ��ȡ��ݡ�
�� ϵ��֧�ֶϵ��ܣ��ж��棬�´��ʱ��progress.txt�ļ��еĽ��ȼ��ȡ��ۣ��Ѿ�д��һ��CSV�ļ�Ҳ��д�룬��ݶ�ʧ��

��ȡBվ��ۣ�Python��ʵ��

��ͷ��Ϣ��αװ��

��ô��Ϣ

��¼Bվ��ȡ��cookies��Զ��¼

��ȵ��ļ��

��۵��ļ��

��۵��ļ��

��cookies

��ѧϰ

��

��

��ؿγ�

��ص��

��ʵ�鳡��

��ȡBվ���ۣ�Python����ʵ�����

��������ͷ����Ϣ��αװ�����������

���ô�����Ϣ

��¼Bվ���ȡ��cookies�������Զ���¼

������ȵ��ļ���

�������۵��ļ�����

�����������۵��ļ���

����cookies

�����������ѧϰ

��������

��������

��ؿγ�

��ص�����

���ʵ�鳡��

��ȡBվ��ۣ�Python��ʵ��

��ͷ��Ϣ��αװ��

��ô��Ϣ

��¼Bվ��ȡ��cookies��Զ��¼

��ȵ��ļ��

��۵��ļ��

��۵��ļ��

��cookies

��ѧϰ

��

��

��ص��

��ʵ�鳡��