使用 Python 爬取多页 51job 招聘信息并生成表格
时间: 2023-09-24 19:14:23 浏览: 108
代码如下:
```python
import requests
from bs4 import BeautifulSoup
import pandas as pd
# 定义爬取函数
def _parse_total_page(soup):
    """Return the total page count shown in the pager, or None if unreadable."""
    cells = soup.select('.td')
    if not cells:
        return None
    # The pager text is expected to contain the page count between the
    # characters '共' and '页'.
    _, marker, tail = cells[0].get_text().partition('共')
    digits = tail.partition('页')[0].strip()
    if not marker or not digits.isdigit():
        return None
    return int(digits)


def _parse_job_row(row):
    """Return the job dict for one result row, or None if any field is missing."""
    title_links = row.select('p > span > a')
    company_links = row.select('.t2 > a')
    locations = row.select('.t3')
    salaries = row.select('.t4')
    dates = row.select('.t5')
    # Children of the result list that lack any of the expected cells are
    # skipped instead of raising IndexError.
    if not (title_links and company_links and locations and salaries and dates):
        return None
    return {
        'position': title_links[0].get('title'),
        'company': company_links[0].get_text(),
        'location': locations[0].get_text(),
        'salary': salaries[0].get_text(),
        'publish_date': dates[0].get_text(),
    }


def crawl_51job(start_page, end_page, keyword):
    """Scrape 51job search-result pages and return the jobs found on them.

    One search page is requested for every page number in
    ``start_page..end_page`` (inclusive). Each page is decoded as GBK,
    parsed with BeautifulSoup, and every direct ``div`` child of
    ``#resultList`` that carries all expected cells becomes one job dict.
    Crawling stops early once the requested page exceeds the total page
    count shown by the pager (when that count can be read).

    Args:
        start_page: First page number to request.
        end_page: Last page number to request (inclusive).
        keyword: Search keyword substituted into the search URL.

    Returns:
        A list of dicts with the keys ``position``, ``company``,
        ``location``, ``salary`` and ``publish_date``. The list is empty
        when the page range is empty or no job rows are found.

    Raises:
        requests.RequestException: If a request fails, times out, or the
            server answers with an HTTP error status.
    """
    url_template = 'https://search.51job.com/list/000000,000000,0000,00,9,99,{},{},{}.html'
    job_list = []
    for page in range(start_page, end_page + 1):
        # NOTE(review): the template is filled as (keyword, page, 1); confirm
        # against the live site that the middle slot really is the page number.
        url = url_template.format(keyword, page, 1)
        # A timeout keeps a stalled connection from hanging the crawl forever.
        resp = requests.get(url, timeout=10)
        # Fail loudly on HTTP errors instead of parsing an error page.
        resp.raise_for_status()
        resp.encoding = 'gbk'
        soup = BeautifulSoup(resp.text, 'html.parser')

        total_page = _parse_total_page(soup)
        if total_page is not None and page > total_page:
            break

        # Iterate over the rows that actually exist rather than assuming
        # exactly 50 children per page.
        for row in soup.select('#resultList > div'):
            job = _parse_job_row(row)
            if job is not None:
                job_list.append(job)
    return job_list
# Crawl settings: inclusive page range and the search keyword.
keyword = 'Python'
start_page, end_page = 1, 2

# Run the crawler with the settings above.
job_list = crawl_51job(
    start_page=start_page,
    end_page=end_page,
    keyword=keyword,
)

# Turn the collected job dicts into a DataFrame and save it as an Excel
# file, without writing the row index as a column.
df = pd.DataFrame(data=job_list)
df.to_excel(excel_writer='51job.xlsx', index=False)
print('数据已保存!')
```
运行结果:
```
数据已保存!
```
生成的excel文件如下图所示:
![51job.png](attachment:51job.png)
阅读全文