The code is below. (How can I modify it, as simply as possible, to scrape a large amount of data? Guidance from anyone more experienced would be appreciated, thanks.)
import os
import requests
import csv
import time
import random
from bs4 import BeautifulSoup
from urllib.parse import quote
# Pool of User-Agent strings to rotate through
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0',
]
# Build a request header with a randomly chosen User-Agent
def generate_random_headers():
    return {
        'host': 'www.lagou.com',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'user-agent': random.choice(USER_AGENTS),  # rotate User-Agents per request
    }
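# Lagou tends to redirect anonymous visitors to a login page, which is one
# reason for the non-200 responses handled below. A hypothetical workaround:
# copy the cookie from a logged-in browser session into the headers. The
# cookie value here is a placeholder, not a real credential.
def generate_logged_in_headers():
    headers = generate_random_headers()
    headers['cookie'] = 'PASTE_COOKIE_FROM_YOUR_LOGGED_IN_BROWSER_HERE'  # placeholder
    headers['referer'] = 'https://www.lagou.com/'
    return headers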
# Split a combined "position + location" string into its two parts
def split_position(position_full):
    delimiters = ['(', '[', ',', ' ']
    for delimiter in delimiters:
        if delimiter in position_full:
            parts = position_full.split(delimiter, 1)
            position = parts[0].strip()
            location = delimiter + parts[1].strip()  # keep the delimiter in front of the location
            return [position, location]
    return [position_full, '未知']
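# Quick sanity checks for split_position (the sample titles are invented,
# but follow the "职位[地区]" shape of Lagou list pages):
assert split_position('Java开发工程师[北京·中关村]') == ['Java开发工程师', '[北京·中关村]']
assert split_position('数据分析师') == ['数据分析师', '未知']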
# Split a salary range such as "10k-20k" into minimum and maximum
def split_salary(salary_full):
    if '-' in salary_full:
        min_salary, max_salary = salary_full.split('-', 1)
        min_salary = min_salary.strip()
        max_salary = max_salary.strip()
    else:
        min_salary = max_salary = salary_full.strip()
    return {
        '薪资范围': salary_full,  # keep the full salary range as well
        '最低工资': min_salary,
        '最高工资': max_salary
    }
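# Sanity check for split_salary:
assert split_salary('10k-20k') == {'薪资范围': '10k-20k', '最低工资': '10k', '最高工资': '20k'}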
# Split a combined "experience/education" string into its two parts
def split_experience_education(exp_edu_full):
    if '/' in exp_edu_full:
        experience, education = exp_edu_full.split('/', 1)
        return experience.strip(), education.strip()
    return exp_edu_full.strip(), '未知'
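# Sanity check for split_experience_education:
assert split_experience_education('经验3-5年 / 本科') == ('经验3-5年', '本科')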
# URL template; kd is the (URL-encoded) keyword and pn the page number
url_template = "https://www.lagou.com/wn/zhaopin?kd={}&pn={}"

# Accumulates the parsed rows from every page
all_job_data = []
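# For example, page 2 of the "产品经理" search expands to the URL below:
assert url_template.format(quote('产品经理'), 2) == \
    'https://www.lagou.com/wn/zhaopin?kd=%E4%BA%A7%E5%93%81%E7%BB%8F%E7%90%86&pn=2'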
# Scrape pages 1 through 99 for each keyword
for page_num in range(1, 100):
    for keyword in ['Java', 'python', 'PHP', '产品经理', '销售顾问', '会计', 'Delphi', '区块链']:
        encoded_keyword = quote(keyword)  # URL-encode the keyword
        url = url_template.format(encoded_keyword, page_num)
        print(f"Fetching page {page_num} for {keyword} from {url}")
        try:
            # Send the request with randomized headers
            headers = generate_random_headers()
            # Note: the original (0.5 s, 1 s) connect/read timeouts are aggressive
            # enough to fail constantly; a few seconds each is more forgiving
            response = requests.get(url, headers=headers, timeout=(3, 10), allow_redirects=False)
            print(f"Status code: {response.status_code}")
            print(f"Response headers: {response.headers}")
            if response.status_code != 200:
                print(f"Failed to retrieve page {page_num} for {keyword}. Status code: {response.status_code}")
                continue
            html_content = response.text
            soup = BeautifulSoup(html_content, 'html.parser')
            # CSS selectors for the job cards (class names from Lagou's rendered page)
            positions = soup.select('.p-top__1F7CL a')
            job_infos = soup.select('.p-bom__JlNur')
            company_names = soup.select('.company-name__2-SjF a')
            industries = soup.select('.industry__1HBkr')
            logo_urls = soup.select('.com-logo__1QOwC img')
            additional_infos = soup.select('.item-bom__cTJhu .il__3lk85')
            # Log how many elements each selector matched
            print(f"提取到 {len(positions)} 个职位信息")
            print(f"提取到 {len(job_infos)} 个职位的薪资和经验学历信息等")
            # print(f"提取到 {len(company_names)} 个公司名称")
            # print(f"提取到 {len(industries)} 个行业信息")
            # print(f"提取到 {len(logo_urls)} 个公司Logo")
            # print(f"提取到 {len(additional_infos)} 个附加信息")
            # Walk each job card and pull out its fields
            for i in range(len(positions)):
                try:
                    # Position and location
                    position_full = positions[i].text.strip() if i < len(positions) else ''
                    position_parts = split_position(position_full)
                    position = position_parts[0].strip() if len(position_parts) > 0 else position_full
                    location = position_parts[1].strip() if len(position_parts) > 1 else '未知'
                    company_name = company_names[i].text.strip() if i < len(company_names) else ''
                    industry = industries[i].text.strip() if i < len(industries) else ''
                    logo_url = logo_urls[i]['src'].strip() if i < len(logo_urls) else ''
                    additional_info = additional_infos[i].text.strip() if i < len(additional_infos) else ''
                    # The info block bundles salary with experience/education;
                    # everything before '经验' is salary, the rest is experience/education
                    job_info = job_infos[i].text.strip() if i < len(job_infos) else ''
                    salary_info = split_salary(job_info.split('经验')[0].strip()) if '经验' in job_info else {
                        '薪资范围': job_info, '最低工资': '', '最高工资': ''}
                    experience_education = '经验' + job_info.split('经验')[1].strip() if '经验' in job_info else ''
                    experience, education = split_experience_education(experience_education)
                    all_job_data.append({
                        '职位': position,
                        '地区': location,
                        '薪资范围': salary_info['薪资范围'],
                        '最低工资': salary_info['最低工资'],
                        '最高工资': salary_info['最高工资'],
                        '经验': experience,
                        '学历': education,
                        '公司名': company_name,
                        '行业': industry,
                        '公司 Logo URL': logo_url,
                        '附加信息': additional_info
                    })
                except Exception as e:
                    print(f"Error processing item {i} on page {page_num} for {keyword}: {e}")
            # Pause between requests to avoid hammering the server
            time.sleep(5)
        except Exception as e:
            print(f"Error fetching data for {keyword} on page {page_num}: {e}")
# Write the collected rows to a CSV file
csv_file = 'data11.csv'
# 'utf-8-sig' instead of 'utf-8' would keep Excel from mangling the Chinese headers
with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['职位', '地区', '薪资范围', '最低工资', '最高工资', '经验', '学历', '公司名', '行业', '公司 Logo URL', '附加信息']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for job in all_job_data:
        writer.writerow(job)

full_path = os.path.join(os.getcwd(), csv_file)  # absolute path of the output file
print(f"所有页的数据已保存到 {csv_file}")
print(full_path)
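As for the actual question of scraping more data: the two changes most likely to help are reusing a single requests.Session with retry/backoff, and making the keyword the outer loop so a blocked or exhausted keyword stops early instead of eating all 99 page requests. Below is a minimal sketch under those assumptions; it reuses url_template and generate_random_headers from above, and the retry counts, delays, and status list are illustrative rather than tuned for Lagou:

from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    # One Session reuses TCP connections and keeps cookies between requests
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=2,
                  status_forcelist=[429, 500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retry))
    return session

def crawl_keyword(session, keyword, max_pages=99):
    # Yield the parsed soup of each result page until the listings run out
    for page_num in range(1, max_pages + 1):
        url = url_template.format(quote(keyword), page_num)
        response = session.get(url, headers=generate_random_headers(),
                               timeout=(5, 15), allow_redirects=False)
        if response.status_code != 200:
            break  # a 3xx here usually means Lagou bounced us to the login page
        soup = BeautifulSoup(response.text, 'html.parser')
        if not soup.select('.p-top__1F7CL a'):
            break  # an empty page means we ran past the last page of results
        yield soup
        time.sleep(random.uniform(3, 8))  # jittered delays look less bot-like

The main loop then becomes session = make_session() followed by for keyword in [...]: for soup in crawl_keyword(session, keyword): with the same per-card parsing as above. For genuinely large volumes you would additionally need rotating proxies and a logged-in cookie; both are site- and account-specific, so they are left out of this sketch.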