使用webcollector 编写一个爬取https://www.xiangha.com/caipu/95247360.html正文内容的代码程序
时间: 2023-08-22 20:08:40 浏览: 151
WebCollector爬虫、网页正文提取
5星 · 资源好评率100%
以下是使用WebCollector编写的爬取https://www.xiangha.com/caipu/95247360.html正文内容的代码程序:
```
import cn.edu.hfut.dmic.webcollector.model.Page
import cn.edu.hfut.dmic.webcollector.net.HttpRequest
import cn.edu.hfut.dmic.webcollector.net.HttpResponse
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
/** Breadth-first crawler that fetches a single xiangha.com recipe page and
  * prints its title, ingredient list and cooking steps to stdout.
  *
  * @param crawlPath directory used by WebCollector to persist crawl state
  * @param autoParse whether WebCollector should auto-extract links
  */
class XianghaCrawler(crawlPath: String, autoParse: Boolean)
extends BreadthCrawler(crawlPath, autoParse) {
// Seed the crawl with the one recipe page we want to extract.
addSeed("https://www.xiangha.com/caipu/95247360.html")
/** Callback invoked by WebCollector for each fetched page.
  *
  * @param page fetched page (HTML body plus response metadata)
  * @param next outgoing requests to schedule (unused here)
  */
override def visit(page: Page, next: Array[HttpRequest]): Unit = {
if (page.getResponseCode == HttpResponse.STATUS_OK) {
// Jsoup's Elements is a java.util.List subclass with no Scala `map`;
// bring in .asScala so Scala collection operations compile.
import scala.jdk.CollectionConverters._
val doc: Document = Jsoup.parse(page.getHtml)
val title = doc.select("h1.title").text()
val ingredients = doc.select("div.ings").text()
// BUG FIX: convert Elements to a Scala collection before map/mkString.
val steps = doc.select("div.steps > ol > li").asScala.map(_.text()).mkString("\n")
println(s"$title\n$ingredients\n$steps")
}
}
}
/** Entry point: launches the recipe crawler with a local state directory
  * and a crawl depth of 2.
  */
object XianghaCrawlerMain {
def main(args: Array[String]): Unit = {
val crawler = new XianghaCrawler("xiangha_crawler", autoParse = true)
crawler.start(2)
}
}
```
代码中使用WebCollector爬虫框架,实现了对目标页面的爬取。在visit方法中,使用Jsoup解析HTML文档,并提取出正文内容,最终输出到控制台。可以根据需要将输出改为保存到文件中。
阅读全文