java爬虫获取静态网页和css样式,js
时间: 2024-02-23 10:57:11 浏览: 139
20031_教育网站HTML页面模板.zip
获取CSS样式和JS文件的方式与获取静态网页本身类似:先抓取网页内容,再从中提取CSS和JS文件的地址并逐个下载即可。下面是一个使用Java爬虫获取静态网页及其CSS和JS文件的示例代码:
```java
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal crawler that downloads a static HTML page together with the external
 * stylesheets and scripts it references.
 *
 * <p>Tag and attribute extraction is regex based and therefore heuristic: it does not
 * understand HTML comments, {@code <base href>}, or attribute values containing
 * {@code >}. Every download is best effort; a failing asset is reported on
 * {@code System.err} and skipped.
 */
public class StaticPageCrawler {
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";

    /** Upper bounds so that an unresponsive server cannot block the crawler forever. */
    private static final int CONNECT_TIMEOUT_MS = 10_000;
    private static final int READ_TIMEOUT_MS = 15_000;

    /** Opening {@code <link ...>} tag; group 1 is the raw attribute text of that single tag. */
    private static final Pattern LINK_TAG =
            Pattern.compile("<link\\s([^>]*)>", Pattern.CASE_INSENSITIVE);

    /** Opening {@code <script ...>} tag; group 1 is the raw attribute text of that single tag. */
    private static final Pattern SCRIPT_TAG =
            Pattern.compile("<script\\s([^>]*)>", Pattern.CASE_INSENSITIVE);

    /**
     * One attribute: group 1 is the name; the value, if any, is in group 2 (double quoted),
     * group 3 (single quoted) or group 4 (unquoted).
     */
    private static final Pattern ATTRIBUTE = Pattern.compile(
            "([^\\s\"'<>/=]+)(?:\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+)))?");

    /** The {@code charset=...} parameter of a Content-Type header value. */
    private static final Pattern CHARSET_PARAM =
            Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Prints the combined content of a page and its assets.
     *
     * @param args optional; {@code args[0]} is the page URL, defaulting to
     *             {@code http://www.example.com} when absent
     */
    public static void main(String[] args) {
        String url = (args != null && args.length > 0) ? args[0] : "http://www.example.com";
        System.out.println(getPageContent(url));
    }

    /**
     * Downloads a page and appends the content of its external stylesheets and scripts.
     *
     * @param url absolute URL of the page
     * @return the page text, followed by all stylesheet text, followed by all script text;
     *         line breaks of each document are preserved and each document ends with a
     *         newline. Returns an empty string when the page itself cannot be fetched.
     */
    public static String getPageContent(String url) {
        String pageContent;
        try {
            pageContent = fetch(new URL(url));
        } catch (IOException | RuntimeException e) {
            // Callers never had to handle exceptions from this method, so report and give up.
            System.err.println("Failed to fetch page " + url + ": " + e);
            return "";
        }
        StringBuilder combined = new StringBuilder(pageContent);
        ensureTrailingNewline(combined);
        combined.append(getCSSContent(pageContent, url));
        combined.append(getJSContent(pageContent, url));
        return combined.toString();
    }

    /**
     * Downloads every stylesheet referenced by a {@code <link>} tag whose {@code rel}
     * attribute contains the token {@code stylesheet}. Other link types such as icons
     * are ignored.
     *
     * @param pageContent HTML text to scan; {@code null} yields an empty result
     * @param url         URL of the page, used to resolve relative {@code href} values
     * @return the concatenated stylesheet text in document order, possibly empty
     */
    public static String getCSSContent(String pageContent, String url) {
        if (pageContent == null) {
            return "";
        }
        URL base = parseBaseUrl(url);
        StringBuilder css = new StringBuilder();
        Matcher tags = LINK_TAG.matcher(pageContent);
        while (tags.find()) {
            String attributes = tags.group(1);
            if (isStylesheetRel(attributeValue(attributes, "rel"))) {
                appendAsset(css, attributeValue(attributes, "href"), base);
            }
        }
        return css.toString();
    }

    /**
     * Downloads every script referenced by a {@code <script src=...>} tag.
     *
     * @param pageContent HTML text to scan; {@code null} yields an empty result
     * @param url         URL of the page, used to resolve relative {@code src} values
     * @return the concatenated script text in document order, possibly empty
     */
    public static String getJSContent(String pageContent, String url) {
        if (pageContent == null) {
            return "";
        }
        URL base = parseBaseUrl(url);
        StringBuilder js = new StringBuilder();
        Matcher tags = SCRIPT_TAG.matcher(pageContent);
        while (tags.find()) {
            appendAsset(js, attributeValue(tags.group(1), "src"), base);
        }
        return js.toString();
    }

    /** Parses the page URL, returning {@code null} if it is unusable so absolute references still work. */
    private static URL parseBaseUrl(String url) {
        try {
            return new URL(url);
        } catch (MalformedURLException e) {
            return null;
        }
    }

    /** Resolves one asset reference against the page URL, downloads it and appends its text to {@code out}. */
    private static void appendAsset(StringBuilder out, String reference, URL base) {
        if (reference == null || reference.trim().isEmpty()) {
            return;
        }
        // Attribute values are HTML-escaped; undo the one escape that commonly occurs in URLs.
        String cleaned = reference.trim().replace("&amp;", "&");
        try {
            // URL(context, spec) handles absolute, "//host/..", "/root" and "relative" forms alike.
            URL assetUrl = new URL(base, cleaned);
            if (!isFetchable(assetUrl, base)) {
                System.err.println("Skipping asset with disallowed scheme: " + assetUrl);
                return;
            }
            out.append(fetch(assetUrl));
            // Keeps a trailing "// comment" line from swallowing the start of the next asset.
            ensureTrailingNewline(out);
        } catch (IOException | RuntimeException e) {
            // The reference comes from untrusted markup; one bad asset must not abort the crawl.
            System.err.println("Skipping asset '" + reference + "': " + e);
        }
    }

    /**
     * Allows http and https everywhere; any other scheme only when the page itself was
     * loaded through it. This stops a remote page from making the crawler read
     * {@code file:} URLs on the local machine.
     */
    private static boolean isFetchable(URL asset, URL base) {
        String protocol = asset.getProtocol();
        if ("http".equalsIgnoreCase(protocol) || "https".equalsIgnoreCase(protocol)) {
            return true;
        }
        return base != null && protocol.equalsIgnoreCase(base.getProtocol());
    }

    /** Downloads {@code target} and returns its body as text with line terminators intact. */
    private static String fetch(URL target) throws IOException {
        URLConnection connection = target.openConnection();
        connection.setRequestProperty("User-Agent", USER_AGENT);
        connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
        connection.setReadTimeout(READ_TIMEOUT_MS);

        StringBuilder body = new StringBuilder();
        try (InputStream in = connection.getInputStream();
             BufferedReader reader =
                     new BufferedReader(new InputStreamReader(in, charsetOf(connection)))) {
            // Read raw characters instead of readLine() so that newlines are not dropped.
            char[] buffer = new char[8192];
            int count;
            while ((count = reader.read(buffer)) != -1) {
                body.append(buffer, 0, count);
            }
        }
        return body.toString();
    }

    /** Returns the charset declared in the Content-Type header, or UTF-8 when none is usable. */
    private static Charset charsetOf(URLConnection connection) {
        String contentType = connection.getContentType();
        if (contentType != null) {
            Matcher declared = CHARSET_PARAM.matcher(contentType);
            if (declared.find()) {
                try {
                    return Charset.forName(declared.group(1));
                } catch (IllegalArgumentException e) {
                    // Unknown or malformed charset name: fall back to the default below.
                }
            }
        }
        return StandardCharsets.UTF_8;
    }

    /**
     * Returns the value of the named attribute within the attribute text of one tag,
     * or {@code null} if the attribute is missing or has no value.
     */
    private static String attributeValue(String attributes, String name) {
        // Attributes are consumed left to right, so text inside a quoted value is never
        // mistaken for an attribute name.
        Matcher attribute = ATTRIBUTE.matcher(attributes);
        while (attribute.find()) {
            if (attribute.group(1).equalsIgnoreCase(name)) {
                for (int group = 2; group <= 4; group++) {
                    if (attribute.group(group) != null) {
                        return attribute.group(group);
                    }
                }
                return null;
            }
        }
        return null;
    }

    /** Tells whether a {@code rel} value contains the {@code stylesheet} token. */
    private static boolean isStylesheetRel(String rel) {
        if (rel == null) {
            return false;
        }
        for (String token : rel.trim().split("\\s+")) {
            if ("stylesheet".equalsIgnoreCase(token)) {
                return true;
            }
        }
        return false;
    }

    /** Appends a newline unless {@code text} is empty or already ends with one. */
    private static void ensureTrailingNewline(StringBuilder text) {
        int length = text.length();
        if (length > 0 && text.charAt(length - 1) != '\n') {
            text.append('\n');
        }
    }
}
```
这个示例代码使用 Java 的正则表达式提取网页中 CSS 和 JS 文件的 URL,并使用 Java 的 URL 和 URLConnection 类获取这些文件的内容,最后将静态网页内容、CSS 内容和 JS 内容依次拼接在一起返回。需要注意的是,用正则表达式解析 HTML 只是一种近似做法,遇到结构复杂的页面时,建议改用专门的 HTML 解析库(例如 jsoup)。
阅读全文