1) 使用Java实现对Baidu新闻网站(http://news.baidu.com/)新闻列表信息的采集功能; 2) 将采集结果保存到MySQL数据库中。新闻采集信息至少包含:新闻类别|新闻标题|采集时间等信息。扩展采集信息(可选)包括:新闻来源|发布时间信息; 3) 要求具有基于新闻标题的新闻查重功能(相同标题的新闻只保存一条)。不使用Jsoup解析库。
时间: 2024-01-24 10:19:54 浏览: 201
实现对Baidu新闻网站新闻列表信息的采集功能,可以使用Java自带的URLConnection类进行网络请求和获取页面内容,然后使用正则表达式或字符串操作等方式提取所需要的信息。以下是一个简单的示例代码:
```java
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal crawler that downloads the Baidu News front page and prints every
 * absolute link found on it as {@code "title link"}.
 *
 * <p>Only JDK classes are used: {@link URLConnection} for the HTTP request and
 * regular expressions for link extraction (no HTML parser library).
 */
public class BaiduNewsCrawler {

    /** Connect/read timeout in milliseconds so a stalled server cannot hang the crawler forever. */
    private static final int TIMEOUT_MILLIS = 10000;

    /**
     * Matches an anchor element with a double-quoted {@code href}.
     * Group 1 is the href value, group 2 the raw anchor body.
     * DOTALL lets the body span several lines; the href may appear after other attributes.
     */
    private static final Pattern ANCHOR_PATTERN = Pattern.compile(
            "<a\\b[^>]*?\\shref=\"([^\"]*)\"[^>]*>(.*?)</a>",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Matches any markup tag nested inside an anchor body (e.g. {@code <b>}, {@code <img ...>}). */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");

    /** Matches runs of whitespace, including line breaks. */
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

    /**
     * Fetches the front page, extracts the links and prints one per line.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String url = "http://news.baidu.com/";
        String content = sendGet(url);
        List<String> newsList = extractNewsList(content);
        for (String news : newsList) {
            System.out.println(news);
        }
    }

    /**
     * Downloads {@code url} with a GET request and returns the body decoded as UTF-8.
     *
     * <p>Failures are reported on standard output and do not propagate; whatever
     * was read before the failure (possibly the empty string) is returned.
     *
     * @param url address of the page to download
     * @return the page text with lines joined by {@code '\n'}
     */
    private static String sendGet(String url) {
        StringBuilder body = new StringBuilder();
        try {
            URLConnection connection = new URL(url).openConnection();
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            // try-with-resources closes the reader even when reading fails half-way.
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
                String line;
                while ((line = in.readLine()) != null) {
                    body.append(line).append('\n');
                }
            }
        } catch (Exception e) {
            System.out.println("Error sending GET request: " + e.getMessage());
        }
        return body.toString();
    }

    /**
     * Extracts every anchor whose href starts with {@code "http"} and whose
     * visible text is not empty.
     *
     * @param content HTML text to scan; {@code null} is treated as empty
     * @return one {@code "title link"} entry per accepted anchor, in document order
     */
    private static List<String> extractNewsList(String content) {
        List<String> newsList = new ArrayList<>();
        if (content == null || content.isEmpty()) {
            return newsList;
        }
        Matcher m = ANCHOR_PATTERN.matcher(content);
        while (m.find()) {
            String link = m.group(1);
            String title = cleanTitle(m.group(2));
            if (link.startsWith("http") && !title.isEmpty()) {
                newsList.add(title + " " + link);
            }
        }
        return newsList;
    }

    /** Removes nested tags from an anchor body and collapses whitespace to single spaces. */
    private static String cleanTitle(String rawAnchorBody) {
        String withoutTags = TAG_PATTERN.matcher(rawAnchorBody).replaceAll("");
        return WHITESPACE_PATTERN.matcher(withoutTags).replaceAll(" ").trim();
    }
}
```
上述代码使用了URLConnection发送GET请求获取页面内容,并使用正则表达式提取新闻列表信息。其中,extractNewsList方法返回一个包含新闻标题和链接的列表。
对于将采集结果保存到MySQL数据库中,我们可以使用JDBC连接数据库并执行SQL语句。以下是一个简单的示例代码:
```java
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Crawls the Baidu News front page, stores the extracted links in MySQL and
 * reports rows that share a title.
 *
 * <p>The SQL below touches a table {@code news} with the columns
 * {@code category}, {@code title}, {@code url} and {@code datetime}.
 * {@code datetime} is written with {@code setLong} (epoch milliseconds), so the
 * column is presumably numeric (e.g. BIGINT) - confirm against the real schema.
 */
public class BaiduNewsCrawler {
    private static final String JDBC_URL = "jdbc:mysql://localhost:3306/news_db";
    private static final String JDBC_USER = "root";
    private static final String JDBC_PASSWORD = "password";

    /** Connect/read timeout in milliseconds so a stalled server cannot hang the crawler forever. */
    private static final int TIMEOUT_MILLIS = 10000;

    /**
     * Matches an anchor element with a double-quoted {@code href}.
     * Group 1 is the href value, group 2 the raw anchor body.
     * DOTALL lets the body span several lines; the href may appear after other attributes.
     */
    private static final Pattern ANCHOR_PATTERN = Pattern.compile(
            "<a\\b[^>]*?\\shref=\"([^\"]*)\"[^>]*>(.*?)</a>",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Matches any markup tag nested inside an anchor body. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");

    /** Matches runs of whitespace, including line breaks. */
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

    /**
     * Fetches the front page, saves the extracted news and prints the titles of
     * rows that duplicate another row's title.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String url = "http://news.baidu.com/";
        String content = sendGet(url);
        List<News> newsList = extractNewsList(content);
        saveToDatabase(newsList);
        List<News> duplicateNewsList = findDuplicateNews();
        System.out.println("Duplicate news:");
        for (News news : duplicateNewsList) {
            System.out.println(news.title);
        }
    }

    /**
     * Opens a new database connection.
     *
     * <p>No {@code Class.forName} call is made: JDBC 4.0 drivers on the class
     * path are registered with {@link DriverManager} automatically.
     */
    private static Connection openConnection() throws SQLException {
        return DriverManager.getConnection(JDBC_URL, JDBC_USER, JDBC_PASSWORD);
    }

    /**
     * Downloads {@code url} with a GET request and returns the body decoded as UTF-8.
     *
     * <p>The page is always fetched from the network. It is intentionally not
     * cached in the {@code news} table: a cached list page would be returned
     * on every later run (so no new items would ever be collected) and would
     * add a row without a title to the news data.
     *
     * <p>Failures are reported on standard output and do not propagate; whatever
     * was read before the failure (possibly the empty string) is returned.
     *
     * @param url address of the page to download
     * @return the page text with lines joined by {@code '\n'}
     */
    private static String sendGet(String url) {
        StringBuilder body = new StringBuilder();
        try {
            URLConnection connection = new URL(url).openConnection();
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            // try-with-resources closes the reader even when reading fails half-way.
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
                String line;
                while ((line = in.readLine()) != null) {
                    body.append(line).append('\n');
                }
            }
        } catch (Exception e) {
            System.out.println("Error sending GET request: " + e.getMessage());
        }
        return body.toString();
    }

    /**
     * Extracts every anchor whose href starts with {@code "http"} and whose
     * visible text is not empty.
     *
     * @param content HTML text to scan; {@code null} is treated as empty
     * @return one {@link News} per accepted anchor, in document order, with an
     *         empty category and the current time as collection time
     */
    private static List<News> extractNewsList(String content) {
        List<News> newsList = new ArrayList<>();
        if (content == null || content.isEmpty()) {
            return newsList;
        }
        Matcher m = ANCHOR_PATTERN.matcher(content);
        while (m.find()) {
            String link = m.group(1);
            String title = cleanTitle(m.group(2));
            if (link.startsWith("http") && !title.isEmpty()) {
                newsList.add(new News(title, link));
            }
        }
        return newsList;
    }

    /** Removes nested tags from an anchor body and collapses whitespace to single spaces. */
    private static String cleanTitle(String rawAnchorBody) {
        String withoutTags = TAG_PATTERN.matcher(rawAnchorBody).replaceAll("");
        return WHITESPACE_PATTERN.matcher(withoutTags).replaceAll(" ").trim();
    }

    /**
     * Inserts every item of {@code newsList} into the {@code news} table.
     * No duplicate check is performed here.
     *
     * <p>Database errors are reported on standard output and stop the
     * remaining inserts; they do not propagate.
     *
     * @param newsList items to store; {@code null} or empty means nothing to do
     */
    private static void saveToDatabase(List<News> newsList) {
        if (newsList == null || newsList.isEmpty()) {
            return;
        }
        String sql = "INSERT INTO news (category, title, url, datetime) VALUES (?, ?, ?, ?)";
        try (Connection conn = openConnection();
             PreparedStatement insertStmt = conn.prepareStatement(sql)) {
            for (News news : newsList) {
                insertStmt.setString(1, news.category);
                insertStmt.setString(2, news.title);
                insertStmt.setString(3, news.url);
                insertStmt.setLong(4, news.datetime);
                insertStmt.executeUpdate();
            }
        } catch (SQLException e) {
            System.out.println("Error saving to database: " + e.getMessage());
        }
    }

    /**
     * Finds rows whose title occurs more than once.
     *
     * <p>For each such title, prints {@code "<title> (<count> duplicates)"} and
     * collects the {@code count - 1} rows with the highest {@code datetime};
     * the row with the lowest {@code datetime} is not reported.
     *
     * @return the redundant rows; empty when none exist or when a database
     *         error occurred (the error is printed, not thrown)
     */
    private static List<News> findDuplicateNews() {
        List<News> duplicateNewsList = new ArrayList<>();
        String groupSql = "SELECT title, COUNT(*) FROM news GROUP BY title HAVING COUNT(*) > 1";
        String rowsSql = "SELECT * FROM news WHERE title = ? ORDER BY datetime DESC LIMIT ?";
        // Two independent statements: reusing one variable for both queries
        // would leave the replaced statement unclosed on every iteration.
        try (Connection conn = openConnection();
             PreparedStatement groupStmt = conn.prepareStatement(groupSql);
             PreparedStatement rowsStmt = conn.prepareStatement(rowsSql);
             ResultSet groups = groupStmt.executeQuery()) {
            while (groups.next()) {
                String title = groups.getString("title");
                int count = groups.getInt(2);
                System.out.println(title + " (" + count + " duplicates)");
                rowsStmt.setString(1, title);
                rowsStmt.setInt(2, count - 1);
                try (ResultSet rows = rowsStmt.executeQuery()) {
                    while (rows.next()) {
                        duplicateNewsList.add(new News(rows.getString("category"), title,
                                rows.getString("url"), rows.getLong("datetime")));
                    }
                }
            }
        } catch (SQLException e) {
            System.out.println("Error finding duplicate news: " + e.getMessage());
        }
        return duplicateNewsList;
    }

    /** Immutable value holder for one collected news link. */
    private static class News {
        /** News category; the crawler currently always stores the empty string. */
        public final String category;
        /** Visible link text. */
        public final String title;
        /** Absolute link target. */
        public final String url;
        /** Collection time in milliseconds since the epoch. */
        public final long datetime;

        /** Creates an item with an empty category, collected now. */
        public News(String title, String url) {
            this("", title, url, System.currentTimeMillis());
        }

        /** Creates an item with every field given explicitly. */
        public News(String category, String title, String url, long datetime) {
            this.category = category;
            this.title = title;
            this.url = url;
            this.datetime = datetime;
        }
    }
}
```
上述代码使用了JDBC连接MySQL数据库并执行SQL语句。其中,saveToDatabase方法将采集到的新闻保存到数据库中;findDuplicateNews方法查找数据库中重复的新闻,并返回一个包含重复新闻的列表。
最后,为了实现基于新闻标题的新闻查重功能,我们可以在保存新闻到数据库的时候先检查数据库中是否已经存在相同标题的新闻,如果存在则不保存。以下是修改后的saveToDatabase方法的代码:
```java
/**
 * Saves the given news items, skipping every item whose title is already
 * present in the {@code news} table (title-based de-duplication).
 *
 * <p>The existence check and the insert use two separate prepared statements.
 * Sharing one statement variable for both would make the second loop iteration
 * run the INSERT through {@code executeQuery()}, which fails and aborts the
 * rest of the batch.
 *
 * <p>Each insert is visible to the following existence checks as long as the
 * connection stays in JDBC's default auto-commit mode, so repeated titles
 * inside {@code newsList} are stored only once as well.
 *
 * <p>Database errors are reported on standard output and stop the remaining
 * items from being processed; they do not propagate.
 *
 * @param newsList items to store; {@code null} or empty means nothing to do
 */
private static void saveToDatabase(List<News> newsList) {
    if (newsList == null || newsList.isEmpty()) {
        return;
    }
    String existsSql = "SELECT 1 FROM news WHERE title = ? LIMIT 1";
    String insertSql = "INSERT INTO news (category, title, url, datetime) VALUES (?, ?, ?, ?)";
    // No Class.forName call: JDBC 4.0 drivers on the class path register themselves,
    // and loading the driver after getConnection (as before) could only ever fail late.
    try (Connection conn = DriverManager.getConnection(JDBC_URL, JDBC_USER, JDBC_PASSWORD);
         PreparedStatement existsStmt = conn.prepareStatement(existsSql);
         PreparedStatement insertStmt = conn.prepareStatement(insertSql)) {
        for (News news : newsList) {
            existsStmt.setString(1, news.title);
            boolean alreadyStored;
            try (ResultSet rs = existsStmt.executeQuery()) {
                alreadyStored = rs.next();
            }
            if (alreadyStored) {
                continue;
            }
            insertStmt.setString(1, news.category);
            insertStmt.setString(2, news.title);
            insertStmt.setString(3, news.url);
            insertStmt.setLong(4, news.datetime);
            insertStmt.executeUpdate();
        }
    } catch (SQLException e) {
        System.out.println("Error saving to database: " + e.getMessage());
    }
}
```
上述代码在保存到数据库之前先查询数据库中是否已经存在相同标题的新闻,如果存在则不保存。这样就实现了基于新闻标题的新闻查重功能。需要注意的是,"先查询、后插入"并不是原子操作:如果有多个采集程序同时运行,仍然可能写入重复标题,因此建议同时在news表的title列上建立唯一索引(UNIQUE)作为最终保障。
阅读全文