def remove_duplicate_columns(df): df2=df.copy() for i in range(df.shape[1]): for n in range(i+1,df.shape[1]): if df[df.columns[i]].equals([df.columns[n]])==False: continue if df[df.columns[i]].equals([df.columns[n]])==True: del df2[df.columns[i]] return df2 what is wrong with my codes in order to Write a function named remove_duplicate_columns to find the duplicate columns (all values are the same) of a given Pandas DataFrame (all column names are different) and remove the duplicated columns (keep the one that first appears in the DataFrame). The input for the function is a Pandas Dataframe and the output should be a new DataFrame with the duplicate columns removed. For example, if df = pd.DataFrame({'col1': [1, 2, 3], 'col2': [5, 8, 6], 'col3': [7, 9, 8], 'col4': [5, 8, 6]}) then remove_duplicate_columns(df) should return pd.DataFrame({'col1': [1, 2, 3], 'col2': [5, 8, 6], 'col3': [7, 9, 8]), because 'col4' has the same elements as 'col2'. Note that the input should not be modified. (hint: dataframe1.equals(dataframe2) can be used to test whether two dataframes (dataframe1 and dataframe2) contain the same elements).

时间: 2023-11-22 08:57:05 浏览: 134
There are a couple of issues with your code: 1. In the line `if df[df.columns[i]].equals([df.columns[n]])==False:`, you are comparing the values of two columns, but you are wrapping one of them in a list. You should remove the square brackets around `df.columns[n]`. 2. You are deleting columns from `df2` inside the loop, which can cause problems with the indexing. It's better to create a list of columns to delete and then delete them outside the loop. Here is a corrected version of your code: ``` def remove_duplicate_columns(df): df2 = df.copy() columns_to_delete = [] for i in range(df.shape[1]): for n in range(i+1,df.shape[1]): if df[df.columns[i]].equals(df[df.columns[n]]) == True: columns_to_delete.append(df.columns[n]) df2 = df2.drop(columns_to_delete, axis=1) return df2 ``` This function should work correctly for the given problem statement.

相关推荐

解释这段代码 def dropNullAndDropDuplicates(spark: SparkSession, df: DataFrame, schema: StructType, dropKeys: Seq[String], duplicateKeys: Array[String]): (LongAccumulator, LongAccumulator, LongAccumulator, DataFrame) = { val schemaFieldNames: Array[String] = schema.fieldNames if (dropKeys.exists(!schemaFieldNames.contains(_)) || duplicateKeys.exists(!schemaFieldNames.contains(_))) { return (null, null, null, null) } val lineCount: LongAccumulator = spark.sparkContext.longAccumulator("lineCount") val trash: LongAccumulator = spark.sparkContext.longAccumulator("trash") val duplicate: LongAccumulator = spark.sparkContext.longAccumulator("duplicate") val df1: DataFrame = df.select( df.columns.map(name => col(name).as(name.trim.toLowerCase)): _* ) val df1FieldNames: Array[String] = df1.schema.fieldNames val df2: DataFrame = { var tmp: DataFrame = df1 schema.fieldNames.filterNot(df1FieldNames.contains).foreach( fieldName => tmp = tmp.withColumn(fieldName, lit(literal = null)) ) tmp.select( schema.fields .map(structField => tmp.col(structField.name).cast(structField.dataType)): _* ) }.withColumn(colName = "index", monotonically_increasing_id()) val df3: DataFrame = df2.filter(row => { lineCount.add(1) if (dropKeys.exists(key => row.get(row.fieldIndex(key)) == null)) { trash.add(1) false } else { true } }) val df4: DataFrame = df3.groupByKey(row => duplicateKeys.map(key => row.get(row.fieldIndex(key)).toString).mkString("-") )(Encoders.STRING).reduceGroups((row1, row2) => { duplicate.add(1) val defect1 = row1.toSeq.count(_ == null) val defect2 = row2.toSeq.count(_ == null) if (defect1 < defect2) row1 else if (defect1 > defect2) row2 else if (row1.getLong(row1.fieldIndex(name = "index")) > row2.getLong(row1.fieldIndex(name = "index"))) row1 else row2 }).map(_._2)(RowEncoder(df3.schema)) .toDF .drop("index") (lineCount, trash, duplicate, df4) }

import numpy import numpy as np import matplotlib.pyplot as plt import math import torch from torch import nn from torch.utils.data import DataLoader, Dataset import os os.environ['KMP_DUPLICATE_LIB_OK']='True' dataset = [] for data in np.arange(0, 3, .01): data = math.sin(data * math.pi) dataset.append(data) dataset = np.array(dataset) dataset = dataset.astype('float32') max_value = np.max(dataset) min_value = np.min(dataset) scalar = max_value - min_value print(scalar) dataset = list(map(lambda x: x / scalar, dataset)) def create_dataset(dataset, look_back=3): dataX, dataY = [], [] for i in range(len(dataset) - look_back): a = dataset[i:(i + look_back)] dataX.append(a) dataY.append(dataset[i + look_back]) return np.array(dataX), np.array(dataY) data_X, data_Y = create_dataset(dataset) train_X, train_Y = data_X[:int(0.8 * len(data_X))], data_Y[:int(0.8 * len(data_Y))] test_X, test_Y = data_Y[int(0.8 * len(data_X)):], data_Y[int(0.8 * len(data_Y)):] train_X = train_X.reshape(-1, 1, 3).astype('float32') train_Y = train_Y.reshape(-1, 1, 3).astype('float32') test_X = test_X.reshape(-1, 1, 3).astype('float32') train_X = torch.from_numpy(train_X) train_Y = torch.from_numpy(train_Y) test_X = torch.from_numpy(test_X) class RNN(nn.Module): def __init__(self, input_size, hidden_size, output_size=1, num_layer=2): super(RNN, self).__init__() self.input_size = input_size self.hidden_size = hidden_size self.output_size = output_size self.num_layer = num_layer self.rnn = nn.RNN(input_size, hidden_size, batch_first=True) self.linear = nn.Linear(hidden_size, output_size) def forward(self, x): out, h = self.rnn(x) out = self.linear(out[0]) return out net = RNN(3, 20) criterion = nn.MSELoss(reduction='mean') optimizer = torch.optim.Adam(net.parameters(), lr=1e-2) train_loss = [] test_loss = [] for e in range(1000): pred = net(train_X) loss = criterion(pred, train_Y) optimizer.zero_grad() # 反向传播 loss.backward() optimizer.step() if (e + 1) % 100 == 0: print('Epoch:{},loss:{:.10f}'.format(e + 1, loss.data.item())) train_loss.append(loss.item()) plt.plot(train_loss, label='train_loss') plt.legend() plt.show()请适当修改代码,并写出预测值和真实值的代码

import numpy as np import matplotlib.pyplot as plt import math import torch from torch import nn import pdb from torch.autograd import Variable import os os.environ['KMP_DUPLICATE_LIB_OK']='True' dataset = [] for data in np.arange(0, 3, .01): data = math.sin(data * math.pi) dataset.append(data) dataset = np.array(dataset) dataset = dataset.astype('float32') max_value = np.max(dataset) min_value = np.min(dataset) scalar = max_value - min_value dataset = list(map(lambda x: x / scalar, dataset)) def create_dataset(dataset, look_back=3): dataX, dataY = [], [] for i in range(len(dataset) - look_back): a = dataset[i:(i + look_back)] dataX.append(a) dataY.append(dataset[i + look_back]) return np.array(dataX), np.array(dataY) data_X, data_Y = create_dataset(dataset) # 对训练集测试集划分,划分比例0.8 train_X, train_Y = data_X[:int(0.8 * len(data_X))], data_Y[:int(0.8 * len(data_Y))] test_X, test_Y = data_Y[int(0.8 * len(data_X)):], data_Y[int(0.8 * len(data_Y)):] train_X = train_X.reshape(-1, 1, 3).astype('float32') train_Y = train_Y.reshape(-1, 1, 3).astype('float32') test_X = test_X.reshape(-1, 1, 3).astype('float32') class RNN(nn.Module): def __init__(self, input_size, hidden_size, output_size=1, num_layer=2): super(RNN, self).__init__() self.input_size = input_size self.hidden_size = hidden_size self.output_size = output_size self.num_layer = num_layer self.rnn = nn.RNN(input_size, hidden_size, batch_first=True) self.linear = nn.Linear(hidden_size, output_size) def forward(self, x): # 补充forward函数 out, h = self.rnn(x) out = self.linear(out[0]) # print("output的形状", out.shape) return out net = RNN(3, 20) criterion = nn.MSELoss(reduction='mean') optimizer = torch.optim.Adam(net.parameters(), lr=1e-2) train_loss = [] test_loss = [] for e in range(1000): pred = net(train_X) loss = criterion(pred, train_Y) optimizer.zero_grad() # 反向传播 loss.backward() optimizer.step() if (e + 1) % 100 == 0: print('Epoch:{},loss:{:.10f}'.format(e + 1, loss.data.item())) train_loss.append(loss.item()) plt.plot(train_loss, label='train_loss') plt.legend() plt.show()画出预测值真实值图

最新推荐

recommend-type

mysql error:#1062 Duplicate entry ‘***′ for key 1问题解决方法

今天公司的一个网站突然提示MySQL Error Duplicate entry '96624' for key 1错误,经过分析这个问题是由于mysql表中的一个id自增长字段导致。
recommend-type

安装NumPy教程-详细版

附件是安装NumPy教程_详细版,文件绿色安全,请大家放心下载,仅供交流学习使用,无任何商业目的!
recommend-type

zigbee-cluster-library-specification

最新的zigbee-cluster-library-specification说明文档。
recommend-type

管理建模和仿真的文件

管理Boualem Benatallah引用此版本:布阿利姆·贝纳塔拉。管理建模和仿真。约瑟夫-傅立叶大学-格勒诺布尔第一大学,1996年。法语。NNT:电话:00345357HAL ID:电话:00345357https://theses.hal.science/tel-003453572008年12月9日提交HAL是一个多学科的开放存取档案馆,用于存放和传播科学研究论文,无论它们是否被公开。论文可以来自法国或国外的教学和研究机构,也可以来自公共或私人研究中心。L’archive ouverte pluridisciplinaire
recommend-type

实现实时数据湖架构:Kafka与Hive集成

![实现实时数据湖架构:Kafka与Hive集成](https://img-blog.csdnimg.cn/img_convert/10eb2e6972b3b6086286fc64c0b3ee41.jpeg) # 1. 实时数据湖架构概述** 实时数据湖是一种现代数据管理架构,它允许企业以低延迟的方式收集、存储和处理大量数据。与传统数据仓库不同,实时数据湖不依赖于预先定义的模式,而是采用灵活的架构,可以处理各种数据类型和格式。这种架构为企业提供了以下优势: - **实时洞察:**实时数据湖允许企业访问最新的数据,从而做出更明智的决策。 - **数据民主化:**实时数据湖使各种利益相关者都可
recommend-type

解答下列问题:S—>S;T|T;T—>a 构造任意项目集规范族,构造LR(0)分析表,并分析a;a

对于这个文法,我们可以构造以下项目集规范族: I0: S -> .S S -> .T T -> .a I1: S -> S. [$ T -> T. [$ I2: S -> T. I3: S -> S.;S S -> S.;T T -> T.;a 其中,点(.)表示已经被扫描过的符号,;$表示输入串的结束符号。 根据项目集规范族,我们可以构造出LR(0)分析表: 状态 | a | $ ---- | - | - I0 | s3| I1 | |acc I2 | | 其中s3表示移进到状态3,acc表示接受。在分析字符串a;a时,我们可以按照以下步骤进行
recommend-type

JSBSim Reference Manual

JSBSim参考手册,其中包含JSBSim简介,JSBSim配置文件xml的编写语法,编程手册以及一些应用实例等。其中有部分内容还没有写完,估计有生之年很难看到完整版了,但是内容还是很有参考价值的。
recommend-type

"互动学习:行动中的多样性与论文攻读经历"

多样性她- 事实上SCI NCES你的时间表ECOLEDO C Tora SC和NCESPOUR l’Ingén学习互动,互动学习以行动为中心的强化学习学会互动,互动学习,以行动为中心的强化学习计算机科学博士论文于2021年9月28日在Villeneuve d'Asq公开支持马修·瑟林评审团主席法布里斯·勒菲弗尔阿维尼翁大学教授论文指导奥利维尔·皮耶昆谷歌研究教授:智囊团论文联合主任菲利普·普雷教授,大学。里尔/CRISTAL/因里亚报告员奥利维耶·西格德索邦大学报告员卢多维奇·德诺耶教授,Facebook /索邦大学审查员越南圣迈IMT Atlantic高级讲师邀请弗洛里安·斯特鲁布博士,Deepmind对于那些及时看到自己错误的人...3谢谢你首先,我要感谢我的两位博士生导师Olivier和Philippe。奥利维尔,"站在巨人的肩膀上"这句话对你来说完全有意义了。从科学上讲,你知道在这篇论文的(许多)错误中,你是我可以依
recommend-type

实现实时监控告警系统:Kafka与Grafana整合

![实现实时监控告警系统:Kafka与Grafana整合](https://imgconvert.csdnimg.cn/aHR0cHM6Ly9tbWJpei5xcGljLmNuL21tYml6X2pwZy9BVldpY3ladXVDbEZpY1pLWmw2bUVaWXFUcEdLT1VDdkxRSmQxZXB5R1lxaWNlUjA2c0hFek5Qc3FyRktudFF1VDMxQVl3QTRXV2lhSWFRMEFRc0I1cW1ZOGcvNjQw?x-oss-process=image/format,png) # 1.1 Kafka集群架构 Kafka集群由多个称为代理的服务器组成,这
recommend-type

mac上和window原生一样的历史剪切板工具有什么

好的,我明白了您的要求。关于您的问题,为使Mac与Windows的历史剪切板功能类似,可以使用第三方工具"CopyClip"、"Alfred"、"Clipboard Manager"等。这些工具可以记录您复制到剪切板的所有内容,而不仅仅是最后一次复制。同时,它们还可以进行搜索和过滤,以便您更快地找到需要的内容。希望这能回答您的问题。