```python
import requests
from bs4 import BeautifulSoup as BS
import time, random

# Table-of-contents page of the novel to scrape
url = 'http://www.biqu5200.net/2_2598'

# header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) '
#                         'Gecko/20200101 Firefox/113.0'}
head = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36"}

html = requests.get(url, headers=head)
# html.encoding = 'utf-8'
# print(html.status_code)
mybs = BS(html.text, 'html.parser')
# print(mybs.text)
# print(mybs.title.string)

mylinklist = mybs.find_all('a')   # every <a> tag on the index page
# print(mylinklist)

urllist = []
for link in mylinklist:
    url1 = link.get('href')       # None if the tag has no href
    print(url1)
    try:
        if url1[0:7] == '/2_2598':   # chapter links start with /2_2598
            urllist.append('http://www.biqu5200.net' + url1)
    except TypeError:             # raised when url1 is None
        pass
# print(urllist)

with open("D://789.txt", 'w', encoding='utf-8') as file:
    for url2 in urllist[9:35]:    # the 10th-35th links: the chapters to download
        html_zj = requests.get(url2, headers=head)
        mybs_zj = BS(html_zj.text, 'html.parser')
        print(mybs_zj.h1.string)
        file.write(mybs_zj.h1.string + '\n')
        con = mybs_zj.find('div', id='content').text
        file.write(con + '\n')
        time.sleep(random.randint(1, 5) / 10)  # pause 0.1-0.5 s between requests
print("下载完毕!")
```
This is a short Python web-scraping script; here is a step-by-step explanation:

1. `import requests`, `from bs4 import BeautifulSoup as BS`, `import time, random`: import the required libraries: `requests` for HTTP requests, `BeautifulSoup` for parsing HTML, and `time` and `random` for adding delays between requests.
2. `url = 'http://www.biqu5200.net/2_2598'`: the table-of-contents page of the novel to scrape.
3. `head = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36"}`: set a request header so the request looks like it comes from a regular browser rather than a script.
4. `html = requests.get(url, headers=head)`: send the request and fetch the page. The commented-out `html.encoding = 'utf-8'` line would force the response encoding; a short sketch of handling encodings appears after this list.
5. `mybs = BS(html.text, 'html.parser')`: parse the HTML with BeautifulSoup's built-in `html.parser`.
6. `mylinklist = mybs.find_all('a')`: collect every `<a>` tag on the page.
7. `urllist = []`: create an empty list to hold the chapter URLs.
8. `for link in mylinklist: url1 = link.get('href'); print(url1)`: loop over the tags, read each `href` with `link.get('href')`, and print it. `get` returns `None` when a tag has no `href`; the toy example after this list shows this behaviour.
9. `if url1[0:7] == '/2_2598': urllist.append('http://www.biqu5200.net' + url1)`: keep only links whose `href` starts with `/2_2598` (the chapter pages) and prepend the site root to build absolute URLs; the surrounding `try/except` catches the `TypeError` raised when `url1` is `None`. A variant using `urljoin` is sketched after this list.
10. `with open("D://789.txt", 'w', encoding='utf-8') as file: ...`: iterate over `urllist[9:35]` (the 10th through 35th links, skipping the navigation links at the top of the page), request and parse each chapter, write the `<h1>` title and the text of `<div id="content">` to the file, and sleep a random 0.1-0.5 s between requests so as not to overload the server. When the loop finishes, print "下载完毕!" ("download finished"). A more defensive version of this loop is sketched below.
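A note on the commented-out `html.encoding = 'utf-8'` line from step 4: `requests` guesses the encoding from the HTTP headers, which is sometimes wrong for Chinese sites. A minimal sketch of one common fix, using the library's `apparent_encoding` guess (whether this particular site needs it is not confirmed here):

```python
import requests

resp = requests.get('http://www.biqu5200.net/2_2598',
                    headers={"User-Agent": "Mozilla/5.0"})
# resp.encoding comes from the HTTP headers; apparent_encoding is guessed
# from the body and is usually more reliable when Chinese text looks garbled.
if resp.encoding != resp.apparent_encoding:
    resp.encoding = resp.apparent_encoding
print(resp.encoding)
```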
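To see what `find_all('a')` and `link.get('href')` return in steps 6-8 without touching the network, here is a self-contained toy example (the HTML snippet is invented for illustration):

```python
from bs4 import BeautifulSoup as BS

# A made-up page standing in for the real site.
sample = """
<html><body>
  <a href="/2_2598/100.html">Chapter 1</a>
  <a href="/other/1.html">Unrelated link</a>
  <a>No href attribute</a>
</body></html>
"""
soup = BS(sample, 'html.parser')
for link in soup.find_all('a'):
    print(link.get('href'))   # '/2_2598/100.html', '/other/1.html', None
```

The `None` on the last tag is exactly why the script guards the slice check with `try/except`: slicing `None[0:7]` raises a `TypeError`.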
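Step 9 builds absolute URLs by string concatenation. An alternative sketch using `urllib.parse.urljoin`, which handles the slashes for you (the `hrefs` list is invented sample data):

```python
from urllib.parse import urljoin

hrefs = ['/2_2598/100.html', '/other/1.html', None]  # hypothetical sample data

urllist = []
for href in hrefs:
    # An explicit None check replaces the bare try/except in the script.
    if href and href.startswith('/2_2598'):
        urllist.append(urljoin('http://www.biqu5200.net', href))

print(urllist)   # ['http://www.biqu5200.net/2_2598/100.html']
```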
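Finally, the download loop in step 10 assumes every chapter page loads and always contains an `<h1>` and a `<div id="content">`. A more defensive sketch of the same loop (the timeout, status check, and `None` checks are additions, not part of the original script):

```python
import time, random
import requests
from bs4 import BeautifulSoup as BS

head = {"User-Agent": "Mozilla/5.0"}   # shortened; use a full UA string in practice
urllist = []                           # assumed to be filled as in step 9

with open("D://789.txt", 'w', encoding='utf-8') as file:
    for url2 in urllist[9:35]:
        resp = requests.get(url2, headers=head, timeout=10)
        if resp.status_code != 200:    # skip chapters that failed to load
            continue
        page = BS(resp.text, 'html.parser')
        title = page.h1
        content = page.find('div', id='content')
        if title is None or content is None:   # layout changed or request blocked
            continue
        file.write(title.get_text() + '\n')
        file.write(content.get_text() + '\n')
        time.sleep(random.uniform(0.1, 0.5))   # same range as randint(1, 5) / 10
print("下载完毕!")
```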