Write C++ code to scrape Chinese writing material from a specified web page
Scraping a web page involves two parts: network programming to download the page and HTML parsing to extract text from it. Below is a C++ example that uses the libcurl and libxml2 libraries:
```cpp
#include <curl/curl.h>
#include <libxml/HTMLparser.h>
#include <libxml/tree.h>
#include <iostream>
#include <string>
#include <vector>

// libcurl write callback: append the received chunk to the std::string passed via userp.
static size_t WriteCallback(void* contents, size_t size, size_t nmemb, void* userp) {
    size_t realsize = size * nmemb;
    std::string* str = static_cast<std::string*>(userp);
    str->append(static_cast<char*>(contents), realsize);
    return realsize;
}

// Scan a UTF-8 string and keep only CJK ideographs (U+4E00 .. U+9FA5).
// libxml2 stores node content as UTF-8 regardless of the page's original encoding,
// so decoding the 3-byte UTF-8 sequences here is sufficient.
static void ExtractChinese(const unsigned char* s, std::vector<std::string>& result) {
    while (*s) {
        if ((s[0] & 0xF0) == 0xE0 && (s[1] & 0xC0) == 0x80 && (s[2] & 0xC0) == 0x80) {
            unsigned int cp = ((s[0] & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
            if (cp >= 0x4E00 && cp <= 0x9FA5) {
                result.push_back(std::string(reinterpret_cast<const char*>(s), 3));
            }
            s += 3;
        } else {
            ++s;  // ASCII or other byte sequences: skip one byte
        }
    }
}

// Walk the DOM tree recursively, skipping <script> and <style>, and collect Chinese text.
static void WalkNode(xmlNodePtr node, std::vector<std::string>& result) {
    for (xmlNodePtr cur = node; cur; cur = cur->next) {
        if (cur->type == XML_ELEMENT_NODE &&
            (xmlStrcmp(cur->name, (const xmlChar*)"script") == 0 ||
             xmlStrcmp(cur->name, (const xmlChar*)"style") == 0)) {
            continue;  // ignore script and style elements
        }
        if (cur->type == XML_TEXT_NODE && cur->content) {
            ExtractChinese(cur->content, result);
        }
        WalkNode(cur->children, result);
    }
}

// Parse the HTML document and extract Chinese text.
static void ParseHtml(const std::string& html, std::vector<std::string>& result) {
    htmlDocPtr doc = htmlReadDoc(reinterpret_cast<const xmlChar*>(html.c_str()), NULL, NULL,
                                 HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
    if (!doc) {
        std::cerr << "Failed to parse HTML" << std::endl;
        return;
    }
    WalkNode(xmlDocGetRootElement(doc), result);
    xmlFreeDoc(doc);
}

int main(int argc, char** argv) {
    if (argc < 2) {
        std::cerr << "Usage: " << argv[0] << " url" << std::endl;
        return 1;
    }
    curl_global_init(CURL_GLOBAL_DEFAULT);
    CURL* curl = curl_easy_init();
    if (!curl) {
        std::cerr << "Failed to initialize libcurl" << std::endl;
        curl_global_cleanup();
        return 1;
    }
    std::string html;
    std::vector<std::string> result;
    curl_easy_setopt(curl, CURLOPT_URL, argv[1]);
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);  // follow HTTP redirects
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &html);
    CURLcode res = curl_easy_perform(curl);
    if (res != CURLE_OK) {
        std::cerr << "Failed to fetch URL: " << curl_easy_strerror(res) << std::endl;
        curl_easy_cleanup(curl);
        curl_global_cleanup();
        return 1;
    }
    ParseHtml(html, result);
    for (const auto& str : result) {
        std::cout << str;
    }
    std::cout << std::endl;
    curl_easy_cleanup(curl);
    curl_global_cleanup();
    return 0;
}
```
This program takes a web page URL as a command-line argument, downloads the page, parses out the Chinese text, and prints it to standard output. You can adapt it to your own needs, for example by reading a list of URLs from a file and scraping several pages in one run, as sketched below.
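As a minimal sketch of that multi-page variant (assuming one URL per line in the list file, and reusing `WriteCallback` and `ParseHtml` from the example above; the helper name `FetchAll` is just an illustration, not part of the original code):
```cpp
#include <fstream>

// Sketch: scrape every URL listed (one per line) in a text file.
// Reuses WriteCallback and ParseHtml from the example above; FetchAll is a
// hypothetical helper name chosen for illustration.
int FetchAll(const char* listFile) {
    std::ifstream in(listFile);
    if (!in) {
        std::cerr << "Cannot open " << listFile << std::endl;
        return 1;
    }
    curl_global_init(CURL_GLOBAL_DEFAULT);
    std::string url;
    while (std::getline(in, url)) {
        if (url.empty()) continue;          // skip blank lines
        CURL* curl = curl_easy_init();
        if (!curl) continue;
        std::string html;
        std::vector<std::string> result;
        curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
        curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, &html);
        if (curl_easy_perform(curl) == CURLE_OK) {
            ParseHtml(html, result);        // extract Chinese text from this page
            for (const auto& s : result) std::cout << s;
            std::cout << std::endl;
        }
        curl_easy_cleanup(curl);
    }
    curl_global_cleanup();
    return 0;
}
```
Either version must be linked against both libraries when compiling, for example `g++ scrape.cpp -o scrape $(pkg-config --cflags --libs libxml-2.0 libcurl)` (the source file name is just a placeholder).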