如何添加请求表头requests.get
时间: 2024-04-24 14:25:28 浏览: 10
要添加请求头(headers)到`requests.get`方法中,你可以使用`headers`参数。这里是一个示例代码:
```python
import requests

# Target endpoint and a browser-like User-Agent header.
target_url = 'https://example.com'
request_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
}
# Headers are passed to requests.get via the `headers` keyword argument.
response = requests.get(target_url, headers=request_headers)
```
在上面的代码中,`headers`字典包含了请求头的键值对。你可以根据自己的需求添加或修改这些键值对。在这个示例中,我们添加了一个`User-Agent`头,它模拟了一个常见的浏览器用户代理,以便服务器能够正确地处理请求。
注意:上述示例中的URL和User-Agent只是示例,请根据你的实际需求进行修改。
相关问题
# (Question: optimize this code.) Scrape hero attributes from pvp.qq.com
# and write them to heroes.csv. Reformatted from a single squashed line.
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin

# Entry URL. NOTE(review): this is a hero *detail* page, yet the ".herolist"
# selector below belongs to the hero *list* page — confirm the intended URL.
url = "https://pvp.qq.com/web201605/herodetail/527.shtml"
# Browser-like request headers.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57"}
response = requests.get(url, headers=headers, timeout=10)
soup = BeautifulSoup(response.content, "html.parser")


def first_text(sp, selector):
    """Return the text of the first match for *selector*, or '' when
    nothing matches (instead of crashing with IndexError on a layout change)."""
    nodes = sp.select(selector)
    return nodes[0].text if nodes else ""


# Hero links on the site may be relative, so resolve them against the entry
# URL before requesting them.
hero_links = [urljoin(url, hero["href"]) for hero in soup.select(".herolist > li > a")]

# Scrape each hero's attributes.
heroes = []
for link in hero_links:
    response = requests.get(link, headers=headers, timeout=10)
    soup = BeautifulSoup(response.content, "html.parser")
    name = first_text(soup, ".cover-name")
    # BUG in original: soup.select(".") is not valid CSS and raises an
    # exception. TODO(review): supply the real selector for "survive".
    survive = ""
    # NOTE(review): "data-bar2 fl" without leading dots selects element
    # *tags*, not classes — probably ".cover-list-bar.data-bar2.fl"; confirm.
    attack = first_text(soup, ".cover-list-bar data-bar2 fl")
    skill = first_text(soup, ".skill")
    difficulty = first_text(soup, ".difficulty")
    heroes.append({"name": name, "survive": survive, "attack": attack, "skill": skill, "difficulty": difficulty})

# Write results to CSV; utf-8-sig so Excel auto-detects the encoding.
with open("heroes.csv", "w", newline="", encoding="utf-8-sig") as csvfile:
    fieldnames = ["name", "survive", "attack", "skill", "difficulty"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(heroes)
可以尝试使用多线程或异步的方式来提高爬取效率,比如使用`concurrent.futures`库中的`ThreadPoolExecutor`来实现多线程爬取。同时,可以对代码进行优化,减少不必要的HTTP请求,比如将`hero_links`列表中的链接去重,避免重复爬取同一个英雄的属性。另外,可以将英雄属性的选择器封装成一个函数,提高代码的可读性和可维护性。优化后的代码如下:
```python
import csv
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
# Request headers: a browser-like User-Agent so the server treats the
# scraper as an ordinary browser client.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57"
}
# Fetch one hero detail page and scrape its displayed attributes.
def get_hero_attrs(link):
    """Return a dict with keys name/survive/attack/skill/difficulty scraped
    from the hero detail page at *link*.

    Selectors that match nothing yield '' instead of raising IndexError.
    """
    response = requests.get(link, headers=headers, timeout=10)
    soup = BeautifulSoup(response.content, "html.parser")

    def first_text(selector):
        # Guard against selectors that match nothing on a changed layout.
        nodes = soup.select(selector)
        return nodes[0].text if nodes else ""

    name = first_text(".cover-name")
    # BUG in original: soup.select(".") is not a valid CSS selector and
    # raises an exception. TODO(review): supply the real "survive" selector.
    survive = ""
    # NOTE(review): "data-bar2 fl" without leading dots selects element
    # *tags*, not classes — probably ".cover-list-bar.data-bar2.fl"; confirm.
    attack = first_text(".cover-list-bar data-bar2 fl")
    skill = first_text(".skill")
    difficulty = first_text(".difficulty")
    return {"name": name, "survive": survive, "attack": attack,
            "skill": skill, "difficulty": difficulty}
from urllib.parse import urljoin

# Entry URL. NOTE(review): this is a hero *detail* page, yet the ".herolist"
# selector below belongs to the hero *list* page — confirm the intended URL.
url = "https://pvp.qq.com/web201605/herodetail/527.shtml"
response = requests.get(url, headers=headers, timeout=10)
soup = BeautifulSoup(response.content, "html.parser")
# Collect hero links, de-duplicated. Hrefs on the site may be relative, so
# resolve each one against the entry URL before requesting it (the original
# passed relative hrefs straight to requests.get, which raises MissingSchema).
hero_links = {urljoin(url, a["href"]) for a in soup.select(".herolist > li > a")}
# Scrape all heroes concurrently — the work is I/O-bound, so threads let the
# network waits overlap.
with ThreadPoolExecutor(max_workers=8) as executor:
    heroes = list(executor.map(get_hero_attrs, hero_links))
# Write results to CSV; utf-8-sig so Excel auto-detects the encoding.
with open("heroes.csv", "w", newline="", encoding="utf-8-sig") as csvfile:
    fieldnames = ["name", "survive", "attack", "skill", "difficulty"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(heroes)
```
# (Question code, reformatted from a single squashed line.) Scrape each
# company's email and phone from tianyancha.com for every company name
# listed in an Excel file, and save the results to a new workbook.
import requests
from bs4 import BeautifulSoup
import openpyxl
from time import sleep

# Browser-like request headers to mimic an ordinary browser visit.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}


def get_info(company):
    """Look up *company* on tianyancha.com and return (email, phone).

    Either value is '' when it cannot be scraped.
    """
    email = ''
    phone = ''
    url = 'https://www.tianyancha.com/search?key=' + company
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    # BUG in original: the try/except AND the `return email,phone` line were
    # commented out, so the function always returned None and the caller's
    # tuple unpacking crashed. Both are restored here, with the bare
    # `except:` narrowed to the exceptions this body can actually raise.
    try:
        # The first search hit links to the company's detail page.
        company_url = soup.find_all('a', class_='index_alink__zcia5 link-click')[0].get('href')
        r = requests.get(company_url, headers=headers)
        soup = BeautifulSoup(r.text, 'html.parser')
        email = soup.find_all('span', class_='index_detail-email__B_1Tq')[0].text
        sleep(0.5)
        phone = soup.find('span', class_='index_detail-tel__fgpsE').text
    except (requests.RequestException, IndexError, AttributeError):
        pass
    return email, phone


def read_from_excel(file_path):
    """Read company names from column A of the workbook (header row skipped)."""
    wb = openpyxl.load_workbook(file_path)
    ws = wb.active
    company_list = []
    for row in ws.iter_rows(min_row=2, values_only=True):
        company_list.append(row[0])
    return company_list


def write_to_excel(company_list):
    """Scrape each company and write name/email/phone to company_info.xlsx."""
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = 'Company Info'
    # Header row.
    ws.cell(row=1, column=1, value='Company Name')
    ws.cell(row=1, column=2, value='Email')
    ws.cell(row=1, column=3, value='Phone')
    # Data rows.
    for i, company in enumerate(company_list):
        email, phone = get_info(company)
        ws.cell(row=i+2, column=1, value=company)
        ws.cell(row=i+2, column=2, value=email)
        ws.cell(row=i+2, column=3, value=phone)
    wb.save('company_info.xlsx')


# BUG in original: `if name == 'main':` — the dunder underscores were lost.
if __name__ == '__main__':
    file_path = 'company_names.xlsx'
    company_list = read_from_excel(file_path)
    write_to_excel(company_list)
# (Original poster's note: this code fails to locate the company phone on
# Tianyancha — they ask for a rewrite that correctly finds email and phone;
# see the answer that follows.)
好的,以下是重新写的代码,其中对定位电话号码的部分进行了修改和完善,同时也加入了异常处理:
```
import requests
from bs4 import BeautifulSoup
import openpyxl
from time import sleep
# Request headers: a browser-like User-Agent so the request looks like an
# ordinary browser visit.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}
# Scrape a company's email and phone from tianyancha.com.
def get_info(company):
    """Search Tianyancha for *company* and return (email, phone).

    Either value is '' when it cannot be scraped; network or parse failures
    are swallowed so the caller's batch loop keeps going.
    """
    email = ''
    phone = ''
    url = 'https://www.tianyancha.com/search?key=' + company
    r = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(r.text, 'html.parser')
    try:
        # The first search hit links to the company's detail page.
        company_url = soup.find_all('a', class_='name')[0].get('href')
        r = requests.get(company_url, headers=headers, timeout=10)
        soup = BeautifulSoup(r.text, 'html.parser')
        # The email may live in a data attribute rather than the node text.
        email_soup = soup.find('span', class_='email')
        if email_soup:
            email = email_soup.get('data-email')
        # NOTE(review): the ng-if attribute selector looks like an AngularJS
        # template leftover — confirm it exists in the HTML actually served.
        phone_soup = soup.select_one('.detail.web-show span[ng-if="baseInfoModel.phonenumber"]')
        if phone_soup:
            phone = phone_soup.text.strip()
    except (requests.RequestException, IndexError, AttributeError):
        # Narrowed from the original bare `except:`, which also swallowed
        # KeyboardInterrupt and SystemExit.
        pass
    return email, phone
# Read the company names out of an Excel workbook.
def read_from_excel(file_path):
    """Return the company names from column A of the active sheet,
    skipping the header row."""
    workbook = openpyxl.load_workbook(file_path)
    sheet = workbook.active
    return [record[0] for record in sheet.iter_rows(min_row=2, values_only=True)]
# Scrape contact info for every company and save it to an Excel workbook.
def write_to_excel(company_list):
    """Write one row per company — name, email, phone — to company_info.xlsx."""
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = 'Company Info'
    # Header row, then one appended row per company.
    sheet.append(['Company Name', 'Email', 'Phone'])
    for company in company_list:
        email, phone = get_info(company)
        sheet.append([company, email, phone])
        # Brief pause so the requests are not fired too rapidly.
        sleep(0.5)
    workbook.save('company_info.xlsx')
if __name__ == '__main__':
    # Read the input names, then scrape and save the results.
    companies = read_from_excel('company_names.xlsx')
    write_to_excel(companies)
```
希望对你有帮助!