用MFC写一个爬取ts视频的爬虫
时间: 2023-06-23 07:09:32 浏览: 199
基于flask搭建的python爬虫项目,爬取tx视频并返回对应api.zip
首先,需要了解一下MFC的基础知识和网络编程知识。
MFC是Microsoft Foundation Classes的缩写,是微软提供的一套C++类库,用于开发Windows应用程序。网络编程是指编写通过计算机网络进行数据交换和通信的程序,涉及Socket编程、HTTP协议、TCP/IP协议等内容。
接下来,我们可以按照以下步骤来编写一个爬取ts视频的爬虫:
1. 创建一个MFC窗口应用程序项目。
2. 添加一个对话框,用于输入要爬取的视频链接。
3. 在对话框类中添加一个按钮,用于触发爬取视频的操作。
4. 在按钮的单击事件中,使用WinHTTP库发送HTTP请求,获取该视频链接返回的内容(例如页面源代码或m3u8播放列表)。
5. 在源代码中解析出视频的ts分片链接,并将这些链接存储到一个数组中。
6. 使用WinHTTP库下载每个分片链接对应的ts文件。
7. 将所有下载的ts文件合并为一个完整的视频文件。
这里主要用到WinHTTP库中的以下函数(此外,示例代码还使用WinHttpCrackUrl拆分URL,并使用WinHttpCloseHandle释放句柄):
- WinHttpOpen: 打开一个HTTP会话。
- WinHttpConnect: 连接到指定的服务器。
- WinHttpOpenRequest: 创建一个HTTP请求。
- WinHttpSendRequest: 发送HTTP请求。
- WinHttpReceiveResponse: 接收HTTP响应。
- WinHttpQueryHeaders: 获取HTTP响应头信息。
- WinHttpReadData: 读取HTTP响应正文数据。
具体的实现可以参考以下代码示例:
```cpp
void CMyDlg::OnBtnCrawl()
{
CString strUrl;
GetDlgItemText(IDC_EDIT_URL, strUrl);
// 打开HTTP会话
HINTERNET hSession = WinHttpOpen(L"CrawlTSVideo", WINHTTP_ACCESS_TYPE_DEFAULT_PROXY, WINHTTP_NO_PROXY_NAME, WINHTTP_NO_PROXY_BYPASS, 0);
if (!hSession) {
return;
}
// 连接到服务器
URL_COMPONENTS urlComp = {};
urlComp.dwStructSize = sizeof(urlComp);
urlComp.lpszHostName = new TCHAR[INTERNET_MAX_HOST_NAME_LENGTH];
urlComp.dwHostNameLength = INTERNET_MAX_HOST_NAME_LENGTH;
urlComp.lpszUrlPath = new TCHAR[INTERNET_MAX_PATH_LENGTH];
urlComp.dwUrlPathLength = INTERNET_MAX_PATH_LENGTH;
urlComp.lpszScheme = new TCHAR[INTERNET_MAX_SCHEME_LENGTH];
urlComp.dwSchemeLength = INTERNET_MAX_SCHEME_LENGTH;
if (!WinHttpCrackUrl(strUrl, strUrl.GetLength(), 0, &urlComp)) {
WinHttpCloseHandle(hSession);
return;
}
HINTERNET hConnect = WinHttpConnect(hSession, urlComp.lpszHostName, urlComp.nPort, 0);
if (!hConnect) {
delete[] urlComp.lpszHostName;
delete[] urlComp.lpszUrlPath;
delete[] urlComp.lpszScheme;
WinHttpCloseHandle(hSession);
return;
}
// 创建HTTP请求
HINTERNET hRequest = WinHttpOpenRequest(hConnect, L"GET", urlComp.lpszUrlPath, nullptr, WINHTTP_NO_REFERER, WINHTTP_DEFAULT_ACCEPT_TYPES, urlComp.nScheme == INTERNET_SCHEME_HTTPS ? WINHTTP_FLAG_SECURE : 0);
if (!hRequest) {
delete[] urlComp.lpszHostName;
delete[] urlComp.lpszUrlPath;
delete[] urlComp.lpszScheme;
WinHttpCloseHandle(hConnect);
WinHttpCloseHandle(hSession);
return;
}
// 发送HTTP请求
if (!WinHttpSendRequest(hRequest, WINHTTP_NO_ADDITIONAL_HEADERS, 0, WINHTTP_NO_REQUEST_DATA, 0, 0, 0)) {
delete[] urlComp.lpszHostName;
delete[] urlComp.lpszUrlPath;
delete[] urlComp.lpszScheme;
WinHttpCloseHandle(hRequest);
WinHttpCloseHandle(hConnect);
WinHttpCloseHandle(hSession);
return;
}
// 接收HTTP响应
if (!WinHttpReceiveResponse(hRequest, nullptr)) {
delete[] urlComp.lpszHostName;
delete[] urlComp.lpszUrlPath;
delete[] urlComp.lpszScheme;
WinHttpCloseHandle(hRequest);
WinHttpCloseHandle(hConnect);
WinHttpCloseHandle(hSession);
return;
}
// 获取HTTP响应头信息
DWORD dwHeaderLen = 0;
WinHttpQueryHeaders(hRequest, WINHTTP_QUERY_RAW_HEADERS_CRLF, WINHTTP_HEADER_NAME_BY_INDEX, nullptr, &dwHeaderLen, WINHTTP_NO_HEADER_INDEX);
if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
TCHAR* lpHeaders = new TCHAR[dwHeaderLen / sizeof(TCHAR)];
if (WinHttpQueryHeaders(hRequest, WINHTTP_QUERY_RAW_HEADERS_CRLF, WINHTTP_HEADER_NAME_BY_INDEX, lpHeaders, &dwHeaderLen, WINHTTP_NO_HEADER_INDEX)) {
// 解析出视频的ts分片链接
CStringArray arrUrls;
LPCTSTR lpHeader = lpHeaders;
while (*lpHeader) {
if (_tcsnicmp(lpHeader, _T("http"), 4) == 0) {
LPCTSTR lpEnd = _tcschr(lpHeader, _T('\r'));
if (lpEnd) {
arrUrls.Add(CString(lpHeader, lpEnd - lpHeader));
lpHeader = lpEnd;
continue;
}
}
lpHeader = _tcschr(lpHeader, _T('\r'));
if (lpHeader) {
++lpHeader;
}
}
// 下载每个分片链接对应的ts文件
for (int i = 0; i < arrUrls.GetCount(); ++i) {
CString strFileName;
strFileName.Format(_T("video_%d.ts"), i);
DownloadFile(hSession, arrUrls[i], strFileName);
}
// 合并所有下载的ts文件
MergeFiles(_T("video_%d.ts"), arrUrls.GetCount(), _T("video.ts"));
}
delete[] lpHeaders;
}
delete[] urlComp.lpszHostName;
delete[] urlComp.lpszUrlPath;
delete[] urlComp.lpszScheme;
WinHttpCloseHandle(hRequest);
WinHttpCloseHandle(hConnect);
WinHttpCloseHandle(hSession);
}
// Downloads one URL to a local file using an existing WinHTTP session.
//
// hSession    - open WinHTTP session handle; it is used for WinHttpConnect
//               and is NOT closed here.
// lpUrl       - absolute http/https URL to fetch.
// lpFileName  - path of the file to create (overwritten if it exists).
//
// The function returns nothing. On any failure (bad URL, network error,
// non-200 status, read or write error) no file is left behind at lpFileName
// from this call: a partially written file is deleted.
void CMyDlg::DownloadFile(HINTERNET hSession, LPCTSTR lpUrl, LPCTSTR lpFileName)
{
    // Closes a WinHTTP handle when it goes out of scope, so every early
    // return below releases exactly what has been opened so far.
    struct ScopedHInternet {
        HINTERNET h;
        explicit ScopedHInternet(HINTERNET handle) : h(handle) {}
        ~ScopedHInternet() { if (h) { WinHttpCloseHandle(h); } }
        ScopedHInternet(const ScopedHInternet&) = delete;
        ScopedHInternet& operator=(const ScopedHInternet&) = delete;
    };

    if (!hSession || !lpUrl || !lpFileName) {
        return;
    }

    // WinHTTP only has wide-character entry points, so convert explicitly
    // instead of relying on the project being built as Unicode.
    const CStringW strUrlW(lpUrl);

    // Crack the URL. With NULL pointers and non-zero lengths WinHttpCrackUrl
    // returns pointers into strUrlW (not NUL-terminated) instead of copying,
    // so no buffers have to be allocated or freed.
    URL_COMPONENTS urlComp = {};
    urlComp.dwStructSize = sizeof(urlComp);
    urlComp.dwSchemeLength = static_cast<DWORD>(-1);
    urlComp.dwHostNameLength = static_cast<DWORD>(-1);
    urlComp.dwUrlPathLength = static_cast<DWORD>(-1);
    urlComp.dwExtraInfoLength = static_cast<DWORD>(-1);
    if (!WinHttpCrackUrl(strUrlW, static_cast<DWORD>(strUrlW.GetLength()), 0, &urlComp)) {
        return;
    }
    if (!urlComp.lpszHostName || urlComp.dwHostNameLength == 0) {
        return;
    }

    const CStringW strHost(urlComp.lpszHostName, static_cast<int>(urlComp.dwHostNameLength));

    // The request target is the path plus the query string ("extra info").
    CStringW strObject;
    if (urlComp.lpszUrlPath && urlComp.dwUrlPathLength > 0) {
        strObject.Append(urlComp.lpszUrlPath, static_cast<int>(urlComp.dwUrlPathLength));
    }
    if (urlComp.lpszExtraInfo && urlComp.dwExtraInfoLength > 0) {
        strObject.Append(urlComp.lpszExtraInfo, static_cast<int>(urlComp.dwExtraInfoLength));
    }
    if (strObject.IsEmpty()) {
        strObject = L"/";
    }

    // Connect to the server.
    ScopedHInternet connect(WinHttpConnect(hSession, strHost, urlComp.nPort, 0));
    if (!connect.h) {
        return;
    }

    // Create the HTTP request.
    const DWORD dwRequestFlags = (urlComp.nScheme == INTERNET_SCHEME_HTTPS) ? WINHTTP_FLAG_SECURE : 0;
    ScopedHInternet request(WinHttpOpenRequest(connect.h, L"GET", strObject, nullptr, WINHTTP_NO_REFERER, WINHTTP_DEFAULT_ACCEPT_TYPES, dwRequestFlags));
    if (!request.h) {
        return;
    }

    // Send the request and wait for the response.
    if (!WinHttpSendRequest(request.h, WINHTTP_NO_ADDITIONAL_HEADERS, 0, WINHTTP_NO_REQUEST_DATA, 0, 0, 0)) {
        return;
    }
    if (!WinHttpReceiveResponse(request.h, nullptr)) {
        return;
    }

    // Refuse to save an error page (404, 403, ...) as if it were video data.
    DWORD dwStatus = 0;
    DWORD dwStatusSize = sizeof(dwStatus);
    if (!WinHttpQueryHeaders(request.h, WINHTTP_QUERY_STATUS_CODE | WINHTTP_QUERY_FLAG_NUMBER, WINHTTP_HEADER_NAME_BY_INDEX, &dwStatus, &dwStatusSize, WINHTTP_NO_HEADER_INDEX)) {
        return;
    }
    if (dwStatus != HTTP_STATUS_OK) {
        return;
    }

    // Stream the response body into the target file.
    HANDLE hFile = CreateFile(lpFileName, GENERIC_WRITE, 0, nullptr, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, nullptr);
    if (hFile == INVALID_HANDLE_VALUE) {
        return;
    }

    bool bComplete = true;
    BYTE buf[4096];
    for (;;) {
        DWORD dwReadLen = 0;
        if (!WinHttpReadData(request.h, buf, sizeof(buf), &dwReadLen)) {
            bComplete = false;  // network error in the middle of the body
            break;
        }
        if (dwReadLen == 0) {
            break;  // end of body
        }
        DWORD dwWrittenLen = 0;
        if (!WriteFile(hFile, buf, dwReadLen, &dwWrittenLen, nullptr) || dwWrittenLen != dwReadLen) {
            bComplete = false;  // e.g. disk full
            break;
        }
    }
    CloseHandle(hFile);

    // A truncated segment would silently corrupt the merged video, so drop it.
    if (!bComplete) {
        DeleteFile(lpFileName);
    }
}
// Concatenates numbered segment files into a single output file.
//
// lpFileNameFmt    - printf-style format with one integer placeholder that
//                    yields the name of segment i (e.g. "video_%d.ts").
// nFileCount       - number of segments; indices 0 .. nFileCount-1 are used.
// lpMergedFileName - path of the merged output (overwritten if it exists).
//
// Segments that cannot be opened are skipped. The segment files are deleted
// only after the merge completed without a read or write error; if the
// output cannot be created, or an I/O error occurs, the segments are kept so
// the downloaded data is not lost, and a partially written output is removed.
void CMyDlg::MergeFiles(LPCTSTR lpFileNameFmt, int nFileCount, LPCTSTR lpMergedFileName)
{
    if (!lpFileNameFmt || !lpMergedFileName) {
        return;
    }

    HANDLE hMergedFile = CreateFile(lpMergedFileName, GENERIC_WRITE, 0, nullptr, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, nullptr);
    if (hMergedFile == INVALID_HANDLE_VALUE) {
        return;  // keep the segments: nothing has been merged
    }

    bool bMerged = true;
    for (int i = 0; bMerged && i < nFileCount; ++i) {
        CString strFileName;
        strFileName.Format(lpFileNameFmt, i);
        HANDLE hFile = CreateFile(strFileName, GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr);
        if (hFile == INVALID_HANDLE_VALUE) {
            continue;  // missing segment: skip it and carry on
        }

        BYTE buf[4096];
        for (;;) {
            DWORD dwReadLen = 0;
            if (!ReadFile(hFile, buf, sizeof(buf), &dwReadLen, nullptr)) {
                bMerged = false;
                break;
            }
            if (dwReadLen == 0) {
                break;  // end of this segment
            }
            DWORD dwWrittenLen = 0;
            if (!WriteFile(hMergedFile, buf, dwReadLen, &dwWrittenLen, nullptr) || dwWrittenLen != dwReadLen) {
                bMerged = false;  // e.g. disk full
                break;
            }
        }
        CloseHandle(hFile);
    }
    CloseHandle(hMergedFile);

    if (!bMerged) {
        // Remove the truncated output but leave the segments for a retry.
        DeleteFile(lpMergedFileName);
        return;
    }

    // The merge succeeded, so the individual segments are no longer needed.
    for (int i = 0; i < nFileCount; ++i) {
        CString strFileName;
        strFileName.Format(lpFileNameFmt, i);
        DeleteFile(strFileName);
    }
}
```
上述代码中,DownloadFile函数用于下载一个ts文件,MergeFiles函数用于合并多个ts文件。
需要注意的是,这里只是一个简单的示例代码,并没有充分考虑网络异常、文件存储路径等问题,而且下载过程在界面线程中同步执行,期间会阻塞界面;实际应用中需要根据需求进行完善(例如放到工作线程中执行)。另外,使用WinHTTP需要包含winhttp.h头文件并链接winhttp.lib库。
阅读全文