帮我写个java的爬虫示例代码
时间: 2023-06-06 08:08:08 浏览: 50
当然,我非常乐意帮您编写 Java 的爬虫示例代码。以下是一个简单的爬虫示例代码:
```java
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class SpiderDemo {
    /**
     * Entry point: fetches http://www.example.com and prints the absolute URL of
     * every hyperlink ({@code <a href>}) found on the page.
     *
     * @param args unused command-line arguments
     */
    public static void main(String[] args) {
        try {
            // Set an explicit timeout and user agent so the request fails fast
            // and is less likely to be rejected by servers that block clients
            // with a default/empty agent string.
            Document doc = Jsoup.connect("http://www.example.com")
                    .userAgent("Mozilla/5.0 (SpiderDemo)")
                    .timeout(10_000)
                    .get();
            Elements links = doc.select("a[href]");
            for (Element link : links) {
                // "abs:href" resolves relative links against the page's base URL.
                System.out.println(link.attr("abs:href"));
            }
        } catch (IOException e) {
            // Report with context on stderr instead of printStackTrace(), which
            // interleaves badly with normal output and carries no context.
            System.err.println("Failed to fetch http://www.example.com: " + e.getMessage());
        }
    }
}
```
这个爬虫程序使用了 Jsoup 库来解析 HTML,它只获取了指定网站的所有链接并输出到控制台。当然,您可以根据自己的需要,对代码进行修改和优化。希望这能帮到您。
相关问题
帮我用java写个爬虫脚本
你好,我是 C 知道,我可以回答您的问题。以下是一个简单的 Java 爬虫脚本示例:
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class MyCrawler {
    /**
     * Fetches a fixed example page and writes the absolute URL of each
     * hyperlink it contains to standard output.
     *
     * @param args unused command-line arguments
     */
    public static void main(String[] args) {
        String targetUrl = "https://www.example.com";
        try {
            Document page = Jsoup.connect(targetUrl).get();
            // Select every anchor that carries an href attribute and print its
            // absolute form (relative links resolved against the page base).
            for (Element anchor : page.select("a[href]")) {
                System.out.println(anchor.attr("abs:href"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
这个脚本使用 Jsoup 库来解析 HTML 页面,并提取页面中的所有链接。您可以根据需要修改代码来实现您的爬虫功能。希望对您有所帮助!
帮我用java写一个智能爬虫
当然可以!
下面是一个使用 Java 写的简单智能爬虫的示例代码:
```java
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Crawler {
private static final int MAX_PAGES_TO_SEARCH = 10;
private List<String> pagesVisited = new LinkedList<String>();
private List<String> pagesToVisit = new LinkedList<String>();
/**
 * Reports how many URLs are currently queued in pagesToVisit. Note that the
 * queue may still contain already-visited duplicates — those are filtered
 * lazily by nextUrl() — so this is an upper bound on the unvisited count.
 *
 * @return the current length of the pagesToVisit queue
 */
public int getNumberOfUrlsToVisit() {
    return pagesToVisit.size();
}
/**
 * Removes and returns the next URL to visit, skipping any URL that was
 * already visited, and records the returned URL in pagesVisited.
 *
 * @return the next unvisited URL from the pagesToVisit queue
 * @throws IllegalStateException if the queue runs out of candidates (every
 *         queued URL was already visited); the original code instead threw
 *         an opaque IndexOutOfBoundsException from remove(0) on an empty list
 */
public String nextUrl() {
    String nextUrl;
    do {
        if (this.pagesToVisit.isEmpty()) {
            // Guard the exhausted-queue case with a descriptive error.
            throw new IllegalStateException("No unvisited URLs left to crawl");
        }
        nextUrl = this.pagesToVisit.remove(0);
    } while (this.pagesVisited.contains(nextUrl));
    this.pagesVisited.add(nextUrl);
    return nextUrl;
}
/**
 * Breadth-first crawl starting at the given URL, visiting at most
 * MAX_PAGES_TO_SEARCH pages, stopping early as soon as a page containing
 * searchWord is found. Prints a success line or a final visit count.
 *
 * NOTE(review): searchForWord() and getLinks() are defined outside this
 * chunk — presumably "page body contains the word" and "links collected by
 * the last crawl()"; confirm against the rest of the class.
 */
public void search(String url, String searchWord) {
    while(this.pagesVisited.size() < MAX_PAGES_TO_SEARCH) {
        String currentUrl;
        // A fresh Crawler per iteration: "leg" carries only the links/content
        // of the single page it crawls; visited-state lives on `this`.
        Crawler leg = new Crawler();
        if(this.pagesToVisit.isEmpty()) {
            // First iteration: seed with the start URL and mark it visited.
            currentUrl = url;
            this.pagesVisited.add(url);
        } else {
            // NOTE(review): nextUrl() throws if the queue holds only
            // already-visited URLs — possible if pages link back to the seed.
            currentUrl = this.nextUrl();
        }
        // Downloads the page and fills leg.pagesToVisit with its links.
        leg.crawl(currentUrl);
        boolean success = leg.searchForWord(searchWord);
        if(success) {
            System.out.println(String.format("**Success** Word %s found at %s", searchWord, currentUrl));
            break;
        }
        // Queue this page's outgoing links for later visits (duplicates are
        // tolerated here and skipped lazily by nextUrl()).
        this.pagesToVisit.addAll(leg.getLinks());
    }
    System.out.println("\n**Done** Visited " + this.pagesVisited.size() + " web page(s)");
}
/**
 * Downloads the page at the given URL with Jsoup and appends the absolute
 * form of every hyperlink found on it to the pagesToVisit queue. A failed
 * fetch is reported on stdout and otherwise ignored, so one bad page does
 * not abort the whole crawl.
 *
 * @param url the page to download and scan for links
 */
public void crawl(String url) {
    try {
        Document page = Jsoup.connect(url).get();
        for (Element anchor : page.select("a[href]")) {
            // absUrl resolves relative hrefs against the page's base URL.
            pagesToVisit.add(anchor.absUrl("href"));
        }
    } catch (IOException e) {
        System.out.println("Error while trying to retrieve the URL: " + e.getMessage());
    }
}