# Clean the scraped Lagou listings for one city / job kind and write the
# result to '<city>_<kind>_lagou_finall.csv'.  `city`, `kind`, `pd` and `plt`
# are expected to be defined earlier in the file.

# Load the raw listings.
df = pd.read_csv(city+'_'+kind+'_'+'lagou.csv', encoding='utf-8')

# Data cleaning: drop internship postings.  na=False treats a missing job
# title as "not an internship", so the boolean mask never contains NaN
# (a NaN in the mask makes the DataFrame indexing raise).
df.drop(df[df['职位名称'].str.contains('实习', na=False)].index, inplace=True)

# Raw string so that \d is a regex digit class rather than an invalid
# string escape sequence.
pattern = r'\d+'
df['work_year'] = df['工作经验'].str.findall(pattern)
print('work_year:', df['work_year'])

# Collapse each experience requirement into a single number of years.
avg_work_year = []
for i in df['work_year']:
    if len(i) == 0:
        # No digits matched (e.g. '不限' or '应届毕业生'): count as 0 years.
        avg_work_year.append(0)
    elif len(i) == 1:
        # A single number: use it as-is.
        avg_work_year.append(int(i[0]))
    else:
        # A range: halve the sum of the matched numbers.
        avg_work_year.append(sum(int(j) for j in i) / 2)
print('avg_work_year:',avg_work_year)
df['工作经验'] = avg_work_year

# Monthly salary: take the point one quarter of the way up the range.
df['salary'] = df['工资'].str.findall(pattern)
avg_salary = []
for k in df['salary']:
    int_list = [int(n) for n in k]
    if not int_list:
        # Salary text without any digits: record it as missing instead of
        # failing on int_list[0].
        avg_salary.append(float('nan'))
        continue
    # Some salaries contain a single number (e.g. [20]), hence [-1] and
    # not [1] for the upper bound.
    avg_wage = int_list[0] + (int_list[-1] - int_list[0]) / 4
    avg_salary.append(avg_wage)
df['月工资'] = avg_salary

# Treat "no requirement" as the 大专 education level.
df['学历要求'] = df['学历要求'].replace('不限', '大专')
df.to_csv(city+'_'+kind+'_'+'lagou_finall.csv', index=False, encoding='utf-8_sig')

# Use the SimHei font for matplotlib text.
plt.rcParams['font.sans-serif'] = ['simhei']
plt.rcParams['font.serif'] = ['simhei']

拉勾数据采集_Lagou.zip

拉勾数据采集_Lagou

lagou_data.zip

1. **数据**：这显然是一份结构化的数据集，可能包括表格形式的数据，如CSV或Excel文件，用于存储各种信息。 2. **岗位信息**：数据中应包含各个职位的详细描述，如职位名称、职位类别、职责要求等，这些信息可以...

search_job_result += page_kind_job print(search_job_result) print('第{}页数据爬取完毕, 目前职位总数:{}'.format(i, len(search_job_result))) # 每次抓取完成后,暂停一会,防止被服务器拉黑 time.sleep(15) df = pd.DataFrame(data=search_job_result, columns=['公司全名', '公司简称', '公司规模', '融资阶段', '区域', '职位名称', '工作经验', '学历要求', '工资', '职位福利']) df.to_csv(city+'_'+kind+'_'+'lagou.csv', index=False, encoding='utf-8_sig')

这段代码是用来爬取拉勾网上某个城市某个职位类型的职位信息，并将...需要注意的是，代码中的city和kind变量需要根据自己实际爬取的城市和职位类型进行修改。另外，代码中的15秒暂停时间也可以根据自己的情况进行调整。

def get_json(kind, city, page=1,):
    """Request one page of Lagou job listings and return the position results.

    Args:
        kind: Job keyword, sent as the ``kd`` form field (e.g. ``'python'``).
        city: City name, sent as the ``city`` query parameter.
        page: Page number, sent as the ``pn`` form field. Defaults to 1.

    Returns:
        The value at ``['content']['positionResult']`` of the decoded JSON
        response, or ``None`` when the HTTP status is not 200, the body is
        not valid JSON, or the payload does not contain that path.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    data = {
        'first': "true",
        'pn': page,
        'kd': kind
    }
    kw = {'px': 'default', 'city': city, 'needAddtionalResult': 'false'}

    # Pick a User-Agent at random for each request.
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
    ]
    user_agent = random.choice(user_agent_list)
    print('1:',user_agent)
    header = {
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        'User-Agent': user_agent
    }
    print(header)

    # Request URL.  No proxy is used; to route through one, pass
    # proxies={'http': ..., 'https': ...} to requests.post.
    url = 'https://www.lagou.com/wn/zhaopin?pn='
    # The timeout keeps an unresponsive server from blocking the crawl
    # forever.
    response = requests.post(url, params=kw, headers=header, data=data,
                             timeout=10)
    print(response.text)
    response.encoding='utf-8'

    if response.status_code != 200:
        return None
    try:
        return response.json()['content']['positionResult']
    except (ValueError, KeyError, TypeError):
        # A 200 response that is not JSON, or JSON without the expected
        # path, is reported the same way as a failed request.
        return None

- kind：职位类型，比如python、java等 - city：城市名称，比如北京、上海等 - page：页码，可选参数，默认值为1 函数会返回拉勾网站上对应页码、城市和职位类型的职位信息。它使用了随机的User-Agent和代理访问...

sqlContext.read().jdbc(DB_URL, "lagou_data", getDBProperties());解释这段代码

这段代码是使用Spark SQL中的sqlContext对象从JDBC连接读取数据。具体地，该函数使用DB_URL指定的JDBC连接从名为"lagou_data"的数据表中读取数据，并使用getDBProperties()函数返回的属性进行数据库连接。

DataFrame lagou_data = sqlContext .sql("select financeStage as name,count(*) as value from lagou_data group by financeStage order by count(*) desc limit 10");这段代码有问题吗

根据代码片段的可读性，很难...但是，以“lagou_data”作为表名的DataFrame在查询中被引用，如果该表不存在，则会出现代码错误。另外，从查询结果的角度来看，如果financeStage包含null值，则可能会影响结果的正确性。

sqlContext.read().jdbc(DB_URL, "lagou_data", getDBProperties())怎么执行sql代码，返回结果

ChitGPT 无法执行 SQL 代码，它...在您的代码中，使用 sqlContext.read().jdbc 方法来连接数据库并返回结果。您可以在方法中指定数据库连接 URL、表名和设置数据库属性。具体实现取决于您使用的数据库和编程语言。

目标网页：https://www.lagou.com/zhaopin/ 完成重庆市计算机行业工作的查找，月薪5k-10k 要求调用到的方法： 1.下拉菜单的选择； 2.xpath定位

city_select = driver.find_element_by_xpath("//div[@data-property='city']") city_select.click() city_options = Select(city_select.find_element_by_tag_name('select')) city_options.select_by_visible_text...

Traceback (most recent call last): File "D:\pythonProjects\pachong\selenium\ex1_lagou.py", line 6, in <module> driver.find_element_by_xpath('//*[@id="changeCityBox"]/p[1]/a') AttributeError: 'WebDriver' object has no attribute 'find_element_by_xpath'

driver.get('https://www.lagou.com/') element = driver.find_element_by_xpath('//*[@id="changeCityBox"]/p[1]/a') # 对找到的元素进行操作... driver.quit() 请注意，为了正常使用 Selenium，您需要确保...

python3 读取csv

with open(filename, encoding='utf-8') as f: reader = csv.reader(f) header = next(reader) data = \[row for row in reader\] except csv.Error as e: print("Error reading CSV file at line %s:%s" % ...

Traceback (most recent call last): File "D:\pythonProjects\pachong\selenium\ex1_lagou.py", line 6, in <module> web.find_element_by_xpath('//*[@id="changeCityBox"]/p[1]/a') AttributeError: 'WebDriver' object has no attribute 'find_element_by_xpath'

driver.get('https://www.lagou.com/') element = driver.find_element_by_xpath('//*[@id="changeCityBox"]/p[1]/a') # 对找到的元素进行操作... driver.quit() 在这个示例中，我们首先创建了一个Chrome ...

目标网页:https://www.lagou.com/zhaopin/ 完成重庆市计算机行业工作的查找,月薪5k-10k 要求调用到的方法: 1.下拉菜单的选择; 2.xpath定位使用python语言实现

city_input = driver.find_element_by_xpath('//div[@data-lg-tj-id="8FB0"]/input') city_input.click() city_input.send_keys('重庆') time.sleep(1) city_input.send_keys(Keys.ENTER) # 选择计算机/互联网行业 ...

from selenium.webdriver import Chrome import time web=Chrome() xpath='chromedriver.exe' web.get('http://lagou.com')

web.get('http://lagou.com') 请注意，此示例假设您在同一目录中放置了Chrome驱动程序文件“chromedriver.exe”。在实际使用中，您可以更改Chrome驱动程序文件的路径以适应您的环境。希望以上回答能够帮助您...

Traceback (most recent call last): File "D:\pythonProjects\pachong\selenium\ex1_lagou.py", line 5, in <module> web.get("https://www.lagou.com/") TypeError: WebDriver.get() missing 1 required positional argument: 'url'

这个错误是因为 get() 方法需要传入一个参数，即要访问的URL地址。您需要将要访问的网页URL作为参数传递给 get() 方法。请将以下代码示例中的 'https://www.lagou.com/' 替换为您要访问的实际URL地址： ...

excel和csv区别

# 获取文件内容 from re import findall with open('files/lagou.csv', encoding='utf-8') as f: reader = csv.DictReader(f) # 处理数据 cities = {} for job in reader: c = job\['city'\] m = job\['salary'\] #...

from selenium.webdriver import Chrome from selenium.webdriver.chrome.service import Service service = Service('chromedriver.exe') web = Chrome(service=service) web.get('http://lagou.com')

好的，我理解了您的问题。根据您的代码，我可以看出您使用了 Selenium 库中的 Chrome 浏览器驱动来打开拉勾网站。这是一个自动化测试工具，通常用于模拟用户操作来测试网站的功能和UI。请问您还有其他问题吗？

相关推荐

拉勾数据采集_Lagou.zip

lagou_data.zip

（二）招聘数据分析 1、数据准备（可从网站爬取数据） 读取拉勾网的数据分析职位数据 lagou_data.csv，如下表，查看数据基本信息。

lagou_recruitment.csv

python 数据加载读取csv

sqlContext.read().jdbc(DB_URL, "lagou_data", getDBProperties());解释这段代码

DataFrame lagou_data = sqlContext .sql("select financeStage as name,count(*) as value from lagou_data group by financeStage order by count(*) desc limit 10");这段代码有问题吗

sqlContext.read().jdbc(DB_URL, "lagou_data", getDBProperties())怎么执行sql代码，返回结果

目标网页：https://www.lagou.com/zhaopin/ 完成重庆市计算机行业工作的查找，月薪5k-10k 要求调用到的方法： 1.下拉菜单的选择； 2.xpath定位

Traceback (most recent call last): File "D:\pythonProjects\pachong\selenium\ex1_lagou.py", line 6, in <module> driver.find_element_by_xpath('//*[@id="changeCityBox"]/p[1]/a') AttributeError: 'WebDriver' object has no attribute 'find_element_by_xpath'

python3 读取csv

Traceback (most recent call last): File "D:\pythonProjects\pachong\selenium\ex1_lagou.py", line 6, in <module> web.find_element_by_xpath('//*[@id="changeCityBox"]/p[1]/a') AttributeError: 'WebDriver' object has no attribute 'find_element_by_xpath'

目标网页:https://www.lagou.com/zhaopin/ 完成重庆市计算机行业工作的查找,月薪5k-10k 要求调用到的方法: 1.下拉菜单的选择; 2.xpath定位 使用python语言实现

from selenium.webdriver import Chrome import time web=Chrome() xpath='chromedriver.exe' web.get('http://lagou.com')

Traceback (most recent call last): File "D:\pythonProjects\pachong\selenium\ex1_lagou.py", line 5, in <module> web.get("https://www.lagou.com/") TypeError: WebDriver.get() missing 1 required positional argument: 'url'

excel和csv区别

from selenium.webdriver import Chrome from selenium.webdriver.chrome.service import Service service = Service('chromedriver.exe') web = Chrome(service=service) web.get('http://lagou.com')

最新推荐

python制作爬虫并将抓取结果保存到excel中

Python 查看主机IP及mac地址

MATLAB实现小波阈值去噪：Visushrink硬软算法对比

管理建模和仿真的文件

【交互特征的影响】：分类问题中的深入探讨，如何正确应用交互特征

c语言从链式队列 中获取头部元素并返回其状态的函数怎么写

易语言实现画板图像缩放功能教程

"互动学习：行动中的多样性与论文攻读经历"

【交互特征：优化与调试的艺术】：实战技巧，提升回归模型与分类模型的性能

用IDEA写一个高速收费系统框架附带代码

（二）招聘数据分析 1、数据准备（可从网站爬取数据）读取拉勾网的数据分析职位数据 lagou_data.csv，如下表，查看数据基本信息。

DataFrame lagou_data = sqlContext .sql("select financeStage as name,count(*) as value from lagou_data group by financeStage order by count(*) desc limit 10");这段代码有问题吗

目标网页:https://www.lagou.com/zhaopin/ 完成重庆市计算机行业工作的查找,月薪5k-10k 要求调用到的方法: 1.下拉菜单的选择; 2.xpath定位使用python语言实现

c语言从链式队列中获取头部元素并返回其状态的函数怎么写