1、 使用多协程方式爬取豆果美食https://www.douguo.com/ 精选菜谱中的信息(食物名称、菜单链接、原料、作者、观看量),并将数据存储在csv文件中。
在Python中,我们可以使用 concurrent.futures 模块来实现多线程或多进程的并发(注意:这并不是真正意义上的协程;如需协程方案,可改用 asyncio 配合 aiohttp),同时配合 requests 和 beautifulsoup4 库来抓取网页信息。以下是一个简单的例子,展示如何并发爬取豆果美食的精选菜谱数据并将其存储到CSV文件中:
```python import concurrent.futures import requests from bs4 import BeautifulSoup import csv import time
def fetch_recipe_info(recipe_url):
    """Fetch one recipe page and extract its metadata.

    Parameters
    ----------
    recipe_url : str
        Absolute URL of the recipe page to scrape.

    Returns
    -------
    dict | None
        Dict with keys 'food_name', 'menu_link', 'ingredients',
        'author', 'view_count', or None when the page could not be
        fetched or parsed.
    """
    try:
        # Bug fix: the original issued the request *outside* the try and
        # without a timeout — a connection error escaped this handler and
        # a stalled socket could block a worker thread forever.
        response = requests.get(recipe_url, timeout=10)
        response.raise_for_status()  # treat 404/500 as failures, not HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # NOTE(review): these selectors are assumptions about douguo's
        # markup — confirm against the live page; any .find() may return
        # None and raise AttributeError, which lands in the except below.
        food_name = soup.find('h1', class_='title').text
        ingredients = [i.text for i in soup.select('.ingredients li')]
        author = soup.find('a', {'class': 'author'}).text
        view_count = int(soup.find('span', {'class': 'view-count'}).text.replace(',', ''))
        return {
            'food_name': food_name,
            'menu_link': recipe_url,
            'ingredients': ingredients,
            'author': author,
            'view_count': view_count
        }
    except Exception as e:
        # Best-effort scrape: log and skip the page rather than abort the run.
        print(f"Error occurred while fetching info: {e}")
        return None
def save_to_csv(data_list, file_name): with open(file_name, 'w', newline='', encoding='utf-8-sig') as csvfile: writer = csv.DictWriter(csvfile, fieldnames=['food_name', 'menu_link', 'ingredients', 'author', 'view_count']) writer.writeheader() writer.writerows(data_list)
base_url = 'https://www.douguo.com/cookbook/' recipes_urls = ['{}recipe/{}/'.format(base_url, page_num) for page_num in range(1, 6)] # 示例:假设有5页
# Fan the page fetches out over a small thread pool: blocking HTTP I/O
# releases the GIL, so the five workers overlap their network waits.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    future_to_url = {executor.submit(fetch_recipe_info, url): url
                     for url in recipes_urls}
    data_list = []
    # Collect results as each future finishes, in completion order.
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            recipe_data = future.result()
            if recipe_data is not None:  # skip pages that failed to parse
                data_list.append(recipe_data)
        except Exception as exc:
            # A worker raised past fetch_recipe_info's own handler;
            # report which URL failed and keep going.
            print(f'{url} generated an exception: {exc}')

save_to_csv(data_list, 'douguo_recipes.csv')
相关推荐












