import requests import os import time import json from tqdm import tqdm import re def taopiaopiao(): headers = { 'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Mobile Safari/537.36 Edg/113.0.1774.57' } time.sleep(0.5) url = "https://dianying.taobao.com/showList.htm?spm=a1z21.6646273.city.2.4ed46d6ekOc3wH&n_s=new&city=310100" response = requests.get(url, headers=headers) html = response.text print("网页信息已获取…") time.sleep(0.5) destinationPath = "result.txt" fd = open(destinationPath, "w+", encoding='utf-8') fd.writelines(html) end = html.find('<!-- 即将热映 -->') if end != -1: html = html[:end] #print(html) fd.close() s = '<img width="160" height="224" data-src="(.*?)" src=' + \ '.*?<span class="bt-l">(.+?)</span>.*?<span class="bt-r">(\d.\d)?</span>' + \ ".*?<span>导演:(.*?)</span>" + ".*?<span>主演:(.*?)</span>" + ".*?<span>类型:(.*?)</span>" + \ ".*?<span>地区:(.*?)</span>" + ".*?<span>语言:(.*?)</span>" + ".*?<span>片长:(.*?)</span>" + \ ".*?<a href=(.*?)>" pattern = re.compile(s, re.S) items = re.findall(pattern, html) #print(items) destinationPath = "items.json" fd = open(destinationPath, "w+", encoding='utf-8') json.dump(items, fd) fd.close() dir_name = "./images" if not os.path.exists(dir_name): os.mkdir(dir_name) cnt = 0 for item in tqdm(items): url = item[0] file_name = str(cnt) + ".jpg" cnt += 1 response = requests.get(url, headers=headers) with open(dir_name + "/" + file_name, 'wb') as f: f.write(response.content) info = "图片文件: {0:25}{1}".format(file_name, " 成功下载...") print(info) return items if __name__ == "__main__": taopiaopiao()
The problem with this code is that the import statements are not written with valid syntax: they have all been run together on one line. Each imported module needs its own import statement on its own line. Try rewriting the imports as shown below:
```
import requests
import os
import time
import json
from tqdm import tqdm
import re
```
After making this change, the code should run normally. If you run into any other problems, let me know.
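Beyond the imports, the rest of the question's `taopiaopiao()` function should work once the code is laid out line by line. For reference, here is a minimal sketch of just the image-download loop, assuming `items` is the list returned by `re.findall()` and that `item[0]` holds the poster URL, as in the question; the `download_posters` name, the timeout, and the status check are my additions:

```python
import os
import requests
from tqdm import tqdm

def download_posters(items, dir_name="./images"):
    """Download every poster image; item[0] is assumed to be the image URL."""
    headers = {"user-agent": "Mozilla/5.0"}  # any reasonable UA string works here
    os.makedirs(dir_name, exist_ok=True)     # create ./images if it does not exist yet
    for cnt, item in enumerate(tqdm(items)):
        response = requests.get(item[0], headers=headers, timeout=10)
        response.raise_for_status()          # stop early on HTTP errors
        with open(os.path.join(dir_name, f"{cnt}.jpg"), "wb") as f:
            f.write(response.content)
```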
Related questions
import requests, json import execjs class Baidufanyi(object): def init(self): self.url = 'https://fanyi.baidu.com/v2transapi?from=zh&to=en' self.headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36", } def indetna(self, name, shuzi): data = { "query": name, "sign": shuzi } return data def jspojie(self): name = input(str('请输入需要翻译的文字:')) sign = name print(sign) return name, sign def dmet(self, datan): resqone = requests.post(url=self.url, data=datan, headers=self.headers) dict_data = json.loads(resqone.text)["trans_result"]["data"][0]['dst'] print(dict_data) def main(self): korr = self.jspojie() konaem=self.indetna(korr) self.dmet(konaem) if name == 'main': baidu = Baidufanyi() baidu.main()
Based on the code you provided, the call to `indetna()` is missing a required parameter, `shuzi`. `indetna()` must be called with two arguments: `name` and `shuzi`. Modify your code as follows:
```python
def main(self):
    name, sign = self.jspojie()        # get name and sign from user input
    konaem = self.indetna(name, sign)  # pass both name and sign to indetna
    self.dmet(konaem)
```
Also note that you need to change `if name == 'main':` to `if __name__ == '__main__':` so that `main()` only runs when the file is executed directly as a script. After that change, this part of the code should look like this:
```python
if __name__ == '__main__':
    baidu = Baidufanyi()
    baidu.main()
```
This should resolve the error you are seeing. Remember to double-check that the rest of the code is correct as well.
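Putting the two fixes together, a minimal sketch of how the corrected class might be laid out is shown below. Note that `sign = name` is only the placeholder from your original code; Baidu's real `sign` parameter normally has to be computed by running their JavaScript (for example via `execjs`, which your code imports but never uses), so the request is likely to be rejected until that step is implemented:

```python
import json
import requests

class Baidufanyi(object):
    def __init__(self):
        self.url = 'https://fanyi.baidu.com/v2transapi?from=zh&to=en'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
        }

    def jspojie(self):
        name = input('请输入需要翻译的文字:')
        sign = name  # placeholder; the real sign comes from Baidu's JS, e.g. via execjs
        return name, sign

    def indetna(self, name, shuzi):
        return {"query": name, "sign": shuzi}

    def dmet(self, datan):
        resqone = requests.post(url=self.url, data=datan, headers=self.headers)
        print(json.loads(resqone.text)["trans_result"]["data"][0]['dst'])

    def main(self):
        name, sign = self.jspojie()
        konaem = self.indetna(name, sign)
        self.dmet(konaem)

if __name__ == '__main__':
    baidu = Baidufanyi()
    baidu.main()
```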
For this code: import requests from bs4 import BeautifulSoup import pprint import json def download_all_htmls(): """下载所有列表页面的HTML,用于后续的分析""" htmls = [] headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'} for idx in range(24): url = f"https://www.utusan.com.my/page/{idx+1}" print("craw html:", url) r = requests.get(url) if r.status_code != 200: raise Exception("error") htmls.append(r.text) return htmls htmls = download_all_htmls() print(htmls[0]) — add code that saves each crawled page's data to a separate txt file named after its page number, and then add code that crawls 20 pages every 10 seconds.
The code to save each crawled page to its own txt file is as follows:
```
import requests
from bs4 import BeautifulSoup
import pprint
import json
import time
def download_all_htmls():
"""下载所有列表页面的HTML,用于后续的分析"""
htmls = []
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
for idx in range(24):
url = f"https://www.utusan.com.my/page/{idx+1}"
print("craw html:", url)
r = requests.get(url, headers=headers)
if r.status_code != 200:
raise Exception("error")
htmls.append(r.text)
with open(f"page{idx+1}.txt", "w", encoding="utf-8") as f:
f.write(r.text)
return htmls
htmls = download_all_htmls()
print(htmls[0])
```
In this example, we use a `with` statement to open a file named `page{idx+1}.txt` and write each page's HTML into it.
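Since the docstring says the HTML is saved for later analysis, the saved files can then be parsed offline without crawling the site again. A small sketch, assuming a `page1.txt` file produced by the loop above (the question never shows the parsing step, so the BeautifulSoup usage here is only an illustration):

```python
from bs4 import BeautifulSoup

# Re-load a previously saved page and parse it without hitting the site again.
with open("page1.txt", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

print(soup.title.get_text(strip=True) if soup.title else "no <title> found")
```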
The code that crawls 20 pages and then pauses for 10 seconds is as follows:
```
import requests
from bs4 import BeautifulSoup
import pprint
import json
import time
def download_all_htmls():
"""下载所有列表页面的HTML,用于后续的分析"""
htmls = []
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
for idx in range(24):
url = f"https://www.utusan.com.my/page/{idx+1}"
print("craw html:", url)
r = requests.get(url, headers=headers)
if r.status_code != 200:
raise Exception("error")
htmls.append(r.text)
with open(f"page{idx+1}.txt", "w", encoding="utf-8") as f:
f.write(r.text)
if (idx+1) % 20 == 0:
print("Sleep for 10 seconds...")
time.sleep(10)
return htmls
htmls = download_all_htmls()
print(htmls[0])
```
In this example, we use the `sleep()` function from the `time` module so that the program pauses for 10 seconds after every 20 pages it crawls. This lowers the crawler's request rate and reduces the risk of being blocked.
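If the batch-of-20 pattern turns out to be too bursty, an alternative (my suggestion, not part of the original question) is to add a short fixed delay after every single request instead; `polite_get` below is a hypothetical helper name:

```python
import time
import requests

def polite_get(url, headers, delay=1.0):
    """Fetch a URL, then pause briefly so the overall request rate stays low."""
    r = requests.get(url, headers=headers, timeout=10)
    time.sleep(delay)  # fixed pause between consecutive requests; tune as needed
    return r
```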