# Count how often each education requirement (学历要求) appears in the job
# data and plot the distribution as a bar chart.
# NOTE(review): relies on `df` (DataFrame with a '学历要求' column), `plt`
# (matplotlib.pyplot), `city` and `kind` being defined by the surrounding
# script — confirm against the full file.
edu_counts = {}  # renamed: the original shadowed the builtin `dict`
for level in df['学历要求']:
    # BUGFIX: the first occurrence must count as 1, not 0. The original did
    # `dict[i] = 0` on first sight and only incremented afterwards, so every
    # category was undercounted by exactly one.
    edu_counts[level] = edu_counts.get(level, 0) + 1

index = list(edu_counts.keys())
print('index:', index)
num = [edu_counts[level] for level in index]
print('num:', num)

plt.bar(index, num, width=0.5)
plt.savefig(city + '_' + kind + '_' + '学历要求.png')
plt.show()
时间: 2023-06-24 21:05:55 浏览: 145
这段代码的作用是读取一个数据集中的学历要求列,统计每个学历要求出现的次数,并将其可视化为柱状图。其中,`df`是数据集,`city`和`kind`分别表示城市和工作类型。下面是代码的详细解释:
- `dict = {}`:定义一个空字典`dict`,用于存储每个学历要求出现的次数。
- `for i in df['学历要求']:`:遍历数据集中的学历要求列,对于每个学历要求进行统计。
- `if i not in dict.keys(): dict[i] = 0 else: dict[i] += 1`：如果该学历要求不在字典`dict`中，则将其加入字典，并设置其出现次数为0；否则将其出现次数加1。注意：这里存在一个差一（off-by-one）错误——首次出现被记为 0 而不是 1，导致每个学历要求的统计值都比实际少 1；正确写法是首次出现时 `dict[i] = 1`，或直接使用 `collections.Counter` 统计。
- `index = list(dict.keys())`:获取所有学历要求作为横坐标。
- `num = []`:定义一个空列表`num`,用于存储每个学历要求出现的次数。
- `for i in index: num.append(dict[i])`:遍历所有学历要求,将其出现次数加入列表`num`中。
- `plt.bar(index, num, width=0.5)`:绘制柱状图,横坐标为学历要求,纵坐标为出现次数,每个柱子的宽度为0.5。
- `plt.savefig(city+'_'+kind+'_'+'学历要求.png')`:保存柱状图,以城市、工作类型和学历要求作为文件名。
- `plt.show()`:显示柱状图。
相关问题
# Incremental ("INC") vs. full load of the incoming batch `df`.
# NOTE(review): this is the body of an enclosing method whose `def` line is
# not visible here; `df`, `primary_keys_list` and `param_dict` come from that
# method's scope — confirm against the full class.
if self.config.load_type == "INC":
    # adhoc hist job do not need to join landing merge table
    try:
        # Previously landed/merged data, read back from the destination path.
        landing_merge_df = self.spark.read.format(self.config.destination_file_type). \
            load(self.config.destination_data_path)
        # Strip audit columns from the incoming batch so the joins below
        # compare business columns only.
        df = df.drop("audit_batch_id", "audit_job_id", "audit_src_sys_name",
                     "audit_created_usr", "audit_updated_usr",
                     "audit_created_tmstmp", "audit_updated_tmstmp")
        # New records: keys present in the incoming batch but absent from the
        # landing merge table (left_anti on the primary keys).
        new_insert_df = df.join(landing_merge_df, primary_keys_list, "left_anti")
        self.logger.info(f"new_insert_df count: {new_insert_df.count()}")
        new_insert_df = DataSink_with_audit(self.spark).add_audit_columns(new_insert_df, param_dict)
        # Updated records: keys present on both sides. Keep the incoming
        # business columns ("l.*") plus the original audit columns from the
        # landing side ("lm.*"), then refresh the audit values.
        update_df = df.alias('l').join(landing_merge_df.alias('lm'), on=primary_keys_list, how="inner")
        update_df = update_df.select("l.*", "lm.audit_batch_id", "lm.audit_job_id",
                                     "lm.audit_src_sys_name", "lm.audit_created_usr",
                                     "lm.audit_updated_usr", "lm.audit_created_tmstmp",
                                     "lm.audit_updated_tmstmp")
        self.logger.info(f"update_df count : {update_df.count()}")
        update_df = DataSink_with_audit(self.spark).update_audit_columns(update_df, param_dict)
        # Unchanged records: rows of the landing merge table whose keys do
        # not appear in the incoming batch.
        unchanged_df = landing_merge_df.join(df, on=primary_keys_list, how="left_anti")
        self.logger.info(f"unchanged_records_df count : {unchanged_df.count()}")
        # NOTE(review): union() matches columns by position — assumes all
        # three frames share the same column order; confirm upstream.
        final_df = new_insert_df.union(update_df).union(unchanged_df)
        print("final_df count : ", final_df.count())
    except AnalysisException as e:
        # First incremental run: the destination path does not exist yet, so
        # treat the whole batch as new inserts.
        if e.desc.startswith('Path does not exist'):
            self.logger.info('landing merge table not exists. will skip join landing merge')
            final_df = DataSink_with_audit(self.spark).add_audit_columns(df, param_dict)
        else:
            self.logger.error(f'unknown error: {e.desc}')
            raise e
else:
    # Non-incremental load: no merge needed; just add audit columns.
    final_df = DataSink_with_audit(self.spark).add_audit_columns(df, param_dict)
return final_df
这是一段Python代码,其中包含一个类方法的实现。该方法根据配置参数的不同,从一个特定的数据路径中将数据加载到一个Spark DataFrame中,并对该数据进行一些操作,最终返回一个具有审计列的DataFrame。如果配置参数是"INC",则会执行一些数据合并的操作,包括添加、更新和未更改的记录,并对这些记录添加审计列。如果配置参数是其他值,则只会添加审计列。
def classification_report_to_dict(report):
    """Parse the text output of sklearn's ``classification_report`` into a dict.

    Returns ``{row_label: {'precision': float, 'recall': float,
    'f1-score': float, 'support': int}}`` including the 'macro avg' /
    'micro avg' / 'weighted avg' rows when present.

    BUGFIX: the original assumed each class name sat alone on its own line
    (``len(t) == 1``). In a real sklearn report every data line holds the
    label *and* its four metrics on the same line, so ``class_name`` was
    referenced before assignment on the very first data line — this is the
    ``local variable 'class_name' referenced before assignment`` error.
    """
    lines = [line.strip() for line in report.split('\n') if line.strip()]
    class_dict = {}
    # lines[0] is the "precision recall f1-score support" header — skip it.
    for line in lines[1:]:
        t = line.split()
        if len(t) < 5:
            # e.g. the 'accuracy' row carries only two numbers — skip it.
            continue
        # The last four tokens are the metrics; everything before them is the
        # label (labels such as 'macro avg' or multi-word class names contain
        # spaces, so join them back together).
        name = ' '.join(t[:-4])
        class_dict[name] = {
            'precision': float(t[-4]),
            'recall': float(t[-3]),
            'f1-score': float(t[-2]),
            'support': int(t[-1]),
        }
    return class_dict


def classification_report_from_dict(report_dict):
    """Render a dict from ``classification_report_to_dict`` back to text.

    Per-class rows come first, then the '... avg' summary rows.
    BUGFIX: the original called ``classes.remove('macro avg')`` /
    ``classes.remove('micro avg')`` unconditionally, raising ``ValueError``
    when a row was absent (newer sklearn emits 'weighted avg' instead of
    'micro avg'); filtering by suffix handles every variant.
    """
    avg_names = [k for k in report_dict if k.endswith('avg')]
    classes = [k for k in report_dict if not k.endswith('avg')]

    def _row(name):
        # One formatted report line for a class or an average row.
        m = report_dict[name]
        return (f"{name.ljust(15)}{m['precision']:.2f} {m['recall']:.2f} "
                f"{m['f1-score']:.2f} {int(m['support']):5d}\n")

    lines = ['              precision    recall  f1-score   support\n\n']
    for class_name in classes:
        lines.append(_row(class_name))
    lines.append('\n')
    for avg_name in avg_names:
        lines.append(_row(avg_name))
    return ''.join(lines)


# --- driver fragment from the question --------------------------------------
# NOTE(review): `report` (a list of report strings) and `pd` come from the
# surrounding script — confirm against the full file.
report_dict = {}  # BUGFIX: was used below without ever being initialized
for i, single_report in enumerate(report):  # avoid rebinding `report` mid-loop
    report_dict[f'report_{i + 1}'] = classification_report_to_dict(single_report)
report_df = pd.DataFrame.from_dict(report_dict, orient='index')
# NOTE(review): the cells of report_df are dicts, so .mean() will not average
# the individual metrics — this mirrors the original code; verify before
# relying on the result.
avg_report_dict = report_df.mean().to_dict()
avg_report_str = classification_report_from_dict(avg_report_dict) print(avg_report_str)出现local variable 'class_name' referenced before assignment怎么解决
这个错误的真正原因在 `classification_report_to_dict` 函数中：该函数假设类别名单独占一行（`len(t) == 1` 时才给 `class_name` 赋值），但 sklearn 的 `classification_report` 输出中每一行数据同时包含类别名和四个指标（split 后至少 5 个字段），因此 `len(t) == 1` 永远不成立，第一行数据就会进入 `else` 分支，而此时 `class_name` 还没有被赋值，于是抛出 `local variable 'class_name' referenced before assignment`。解决方法是按"最后四个字段是指标、其余字段是类别名"的方式解析每一行，例如 `name = ' '.join(t[:-4])`，而不要依赖单独的类别名行。另外还有两个相关问题需要一并修复：循环外的 `report_dict` 在使用前需要先初始化为 `{}`；`classification_report_from_dict` 中的 `classes.remove('micro avg')` 在该行不存在时（新版 sklearn 输出的是 'weighted avg'）会抛出 `ValueError`，应改为按是否以 'avg' 结尾来过滤。
阅读全文