import requests from lxml import etree urls = ['https://www.xxddxs.com/book/{}.html'.format(i) for i in range(1, 20)] path = r'D:\py\venv\Scripts\xiaoshuo' def get_text(url: object) -> object: r = requests.get(url) r.encoding = 'utf-8' selector = etree.HTML(r.text) # Get the book title from the page title = selector.xpath('//div[@class="bookname"]/h1/text()') # If no title found, print an error message and return early if len(title) == 0: print(f"No title found for {url}") return # Get the text of the chapter text = selector.xpath('//div[@id="content"]/text()') # Print the text to standard output print(text) # Write the text to a file in the specified directory with open(path + "\\" + title[0], 'w', encoding='utf-8') as f: for i in text: f.write(i) if __name__ == '__main__': for url in urls: get_text(url) 帮我修改并可以爬取到标题
时间: 2024-04-04 19:33:54 浏览: 270
from lxml import etree
Sure! 以下是修改后的代码:
```python
import requests
from lxml import etree
# Pages to crawl: book ids 1 through 19 on the target site.
urls = [
    'https://www.xxddxs.com/book/{}.html'.format(book_id)
    for book_id in range(1, 20)
]
# Directory the downloaded text files are written into.
path = r'D:\py\venv\Scripts\xiaoshuo'
def get_text(url):
    """Download one book page and save its chapter text to a ``.txt`` file.

    The first non-blank text node of ``//div[@class="bookname"]/h1`` is used
    as the file name (inside the module-level ``path`` directory), and the
    text nodes of ``//div[@id="content"]`` become the file's contents.

    Args:
        url: Address of the page to fetch.

    Returns:
        None. A message is printed and nothing is written when the request
        fails or the page has no usable title.
    """
    import os  # local import: only this function needs it

    # A timeout keeps one unresponsive page from hanging the whole crawl;
    # request failures are reported and skipped like a missing title is.
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except requests.RequestException as exc:
        print(f"Request failed for {url}: {exc}")
        return
    r.encoding = 'utf-8'
    selector = etree.HTML(r.text)

    # Get the book title from the page.
    # Defensive: skip the lookup if the parser produced no document root.
    titles = []
    if selector is not None:
        titles = selector.xpath('//div[@class="bookname"]/h1/text()')
    # Take the first title node that is not blank, so a whitespace-only node
    # cannot produce a file called just ".txt".
    name = next((t.strip() for t in titles if t.strip()), None)

    # If no title found, print an error message and return early
    if not name:
        print(f"No title found for {url}")
        return

    # Replace characters that Windows forbids in file names; this also keeps
    # a title containing path separators from escaping the target directory.
    safe_name = name.translate(
        str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})
    )

    # Get the text of the chapter
    text = selector.xpath('//div[@id="content"]/text()')
    # Print the text to standard output
    print(text)

    # Write the text to a file in the specified directory, creating the
    # directory first so open() does not fail on a fresh machine.
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, safe_name + ".txt"), 'w', encoding='utf-8') as f:
        f.writelines(text)
# Script entry point: fetch and save every page listed in ``urls``, in order.
if __name__ == '__main__':
    for page_url in urls:
        get_text(page_url)
```
在原有代码的基础上,我对标题调用了 `.strip()`,以去除标题首尾的空白字符(例如换行符和空格)。此外,我还为文件名添加了 `.txt` 扩展名,以方便后续的操作。
阅读全文