for page in range(0, 10):#根据网页规律,用for循环构建 page=page*10 time.sleep(10) #获取正确url网络地址 url = f'https://www.maoyan.com/board/4?timeStamp=20&offset={page}' headers = { 'Cookie':'__mta=250911417.1684852683551.1684857857025.1684857868530.6; ci=59%2C%E6%88%90%E9%83%BD; ci.sig=6-eKn999I8699yCUqmUVkEAA3RA; featrues=[object Object]; featrues.sig=KbQquuOrr42L3kMHbtKc319ems8; _lxsdk_cuid=188490b5844c8-0dc2de72154915-3e604809-144000-188490b5845c8; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1684852660; uuid_n_v=v1; uuid=6A080B90F97711EDB1C0EF0226805F3B350AFA999F724C6B9013F8578B9E816D; _csrf=c33def8c4b7063982594d90e32c67030753d41029d577dd8c0b7300fd842a5fd; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk=6A080B90F97711EDB1C0EF0226805F3B350AFA999F724C6B9013F8578B9E816D; __mta=250911417.1684852683551.1684852690037.1684856768008.4; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1684857868; _lxsdk_s=188494a0542-07b-504-ec2%7C%7C8', 'Host':'www.maoyan.com', 'Referer':'https://www.maoyan.com/board?timeStamp=1684853042241&channelId=40011&index=1&signKey=683c07ec69bec48fc2589b25eec30cf8&sVersion=1&webdriver=false', #防盗链,表示从哪个页面过来的 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36', }
时间: 2024-01-24 17:18:39 浏览: 149
这段代码是构建请求头信息,其中包括Cookie、Host、Referer和User-Agent等信息,这些信息可以帮助我们模拟浏览器发送请求,避免被网站屏蔽。其中的url变量是根据页面规律构建的请求地址,通过for循环遍历页面,每次获取10个电影信息。time.sleep(10)是为了避免频繁访问网站被封禁IP。
相关问题
for i in range(1,10000): while True: try: page = i url = URL.format(page)###################变更点 news_df = GetOnePageInfo(url) news_dfs = news_dfs.append(news_df) news_dfs['Pubtime'] = pd.to_datetime(news_dfs['Pubtime']) print("第{}页爬取完成".format(i)) except: failures += 1 print("模块运行失败,正在重试...") time.sleep(5) continue else: break如何设置continue失败10次后就停止整个循环
你可以在 while 循环中添加一个计数器,当计数器达到 10 时,使用 `break` 语句退出整个 for 循环。以下是修改后的代码示例:
```
# Crawl pages 1..9999, retrying a page whenever fetching/parsing raises.
# If a single page fails 10 times in a row, abandon the whole crawl.
# URL, GetOnePageInfo, news_dfs, pd and time are expected to be provided by
# the surrounding script (not shown in this snippet).
failures = 0  # total number of failed attempts across all pages
for i in range(1, 10000):
    count = 0  # failed attempts for the current page only
    while True:
        try:
            page = i
            url = URL.format(page)
            news_df = GetOnePageInfo(url)
            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # equivalent that works on both old and new versions.
            news_dfs = pd.concat([news_dfs, news_df])
            news_dfs['Pubtime'] = pd.to_datetime(news_dfs['Pubtime'])
            print("第{}页爬取完成".format(i))
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still
            # terminate the script instead of being counted as a failure.
            failures += 1
            count += 1
            print("模块运行失败,正在重试...")
            if count == 10:
                # This break only leaves the while loop; the check after
                # the loop is what actually stops the for loop.
                break
            time.sleep(5)
        else:
            # Page fetched successfully: stop retrying, go to the next page.
            break
    if count == 10:
        # The current page failed 10 times: stop the whole for loop.
        break
```
在这个示例中,我们添加了一个名为 `count` 的计数器,并在每次进入 `except` 块时将其增加 1。当 `count` 达到 10 时停止重试。注意:`break` 只能跳出它所在的最内层循环(这里是 `while`),因此要终止整个 for 循环,还必须在 `while` 循环结束后再判断一次 `count == 10` 并执行 `break`。
for i in range(1,10000): count = 0 while True: try: page = i url = URL.format(page)###################变更点 news_df = GetOnePageInfo(url) news_dfs = news_dfs.append(news_df) news_dfs['Pubtime'] = pd.to_datetime(news_dfs['Pubtime']) print("第{}页爬取完成".format(i)) except: failures += 1 count += 1 print("模块运行失败,正在重试...") time.sleep(5) if count == 5: break # 当计数器达到 5 时,退出整个 for 循环 continue else: break if news_dfs['Pubtime'].min() < pd.to_datetime('2022-06-01'): break 如何设置 continue 失败10次后就停止整个循环,执行下面的模块
你可以在while循环中添加一个计数器,每次出现异常时计数器加1,当计数器达到10时,使用break终止整个for循环,然后继续执行下面的模块。代码示例如下:
```python
# Crawl pages 1..9999 with per-page retries. The crawl stops when either
#   * one page has failed 10 times in a row, or
#   * the oldest collected 'Pubtime' is earlier than 2022-06-01.
# Code placed after this loop runs in both cases.
# URL, GetOnePageInfo, news_dfs, pd and time are expected to be provided by
# the surrounding script (not shown in this snippet).
failures = 0  # total number of failed attempts across all pages
for i in range(1, 10000):
    count = 0  # failed attempts for the current page only
    while True:
        try:
            page = i
            url = URL.format(page)
            news_df = GetOnePageInfo(url)
            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # equivalent that works on both old and new versions.
            news_dfs = pd.concat([news_dfs, news_df])
            news_dfs['Pubtime'] = pd.to_datetime(news_dfs['Pubtime'])
            print("第{}页爬取完成".format(i))
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still
            # terminate the script instead of being counted as a failure.
            failures += 1
            count += 1
            print("模块运行失败,正在重试...")
            time.sleep(5)
            if count == 10:
                # Leaves only the while loop; the for loop is stopped by
                # the check that follows it.
                break
        else:
            # Page fetched successfully: stop retrying.
            break
    if count == 10:
        # The current page failed 10 times: stop the whole for loop.
        break
    if news_dfs['Pubtime'].min() < pd.to_datetime('2022-06-01'):
        # Reached articles older than the cutoff date: stop crawling.
        break
```
这样,当连续出现10次异常时,就会终止整个for循环,然后执行下面的模块。