1.把这些数据写入csv,并给每一行加上序号。然后是读取:把刚刚写入的疾病名称都读出来,分析一下,两个字的有多少个,三个字的有多少个,四个字的又有多少个。再然后把两个字的存一个csv,三个字的存一个csv,四个字的存一个csv。统计一下重复的数据有哪些,把重复的拎出来放一个csv,没有重复的放一个csv。数据分析的话,估计是要做个图表出来,例如柱状图、折线图这种。
时间: 2023-05-25 21:03:40 浏览: 57
2.首先需要准备好要写入csv的数据,例如:
疾病名称
感冒
肺炎
糖尿病
高血压
脑梗死
3.在Python中使用csv模块将数据写入csv文件并加上序号:
import csv


def write_diseases(names, path='diseases.csv', encoding=None):
    """Write disease names to a CSV file, numbering the rows from 1.

    Args:
        names: Iterable of disease-name strings, written in the given order.
        path: Destination CSV file; it is overwritten if it already exists.
        encoding: Text encoding handed to ``open``. ``None`` keeps the
            platform default, which is also what a plain ``open(path, 'r')``
            uses when the file is read back. Pass the same explicit value
            (for example ``'utf-8'``) to both the writer and the reader if
            the file has to move between machines.
    """
    # newline='' lets the csv module control line endings itself; without it
    # extra blank lines can appear between rows on Windows.
    with open(path, 'w', newline='', encoding=encoding) as file:
        writer = csv.writer(file)
        writer.writerow(['序号', '疾病名称'])
        # enumerate(..., start=1) yields (serial_number, name) pairs, which is
        # exactly the row layout announced by the header above.
        writer.writerows(enumerate(names, start=1))


if __name__ == '__main__':
    data = ['感冒', '肺炎', '糖尿病', '高血压', '脑梗死']
    write_diseases(data)
4.接着读取csv文件中的数据,并对疾病名称进行分析:
import csv
from collections import Counter


def read_disease_names(path='diseases.csv', encoding=None):
    """Return the disease names stored in the second column of a CSV file.

    The first row is treated as a header and skipped. Blank lines and rows
    without a name column are ignored, and surrounding whitespace is stripped
    so that it cannot distort the character count or hide a duplicate.

    Args:
        path: CSV file to read.
        encoding: Text encoding handed to ``open``; ``None`` means the
            platform default.

    Returns:
        The names as a list, in file order, duplicates included.
    """
    names = []
    with open(path, 'r', newline='', encoding=encoding) as file:
        reader = csv.reader(file)
        next(reader, None)  # skip header; the default tolerates an empty file
        for row in reader:
            name = row[1].strip() if len(row) > 1 else ''
            if name:
                names.append(name)
    return names


def group_by_length(names, lengths=(2, 3, 4)):
    """Group names by the number of characters they contain.

    Args:
        names: Iterable of name strings.
        lengths: Character counts that get a bucket; names of any other
            length are left out of the result.

    Returns:
        A dict mapping each requested length to the list of names having
        that length, in input order.
    """
    groups = {length: [] for length in lengths}
    for name in names:
        # Chinese names contain no spaces, so the size must be measured in
        # characters with len(); splitting on ' ' would report 1 for all of
        # them and every bucket would stay empty.
        bucket = groups.get(len(name))
        if bucket is not None:
            bucket.append(name)
    return groups


def find_duplicates(names):
    """Return the names occurring more than once, in first-seen order."""
    occurrences = Counter(names)
    return [name for name, seen in occurrences.items() if seen > 1]


if __name__ == '__main__':
    # Every name in the file takes part in the duplicate statistics, not only
    # the ones that are two to four characters long.
    all_diseases = read_disease_names()

    groups = group_by_length(all_diseases)
    two_words = groups[2]
    three_words = groups[3]
    four_words = groups[4]

    # Duplicate statistics
    unique_diseases = set(all_diseases)
    duplicate_diseases = find_duplicates(all_diseases)
5.将分析结果写入对应的csv文件中:
import csv
import os
from collections import Counter


def read_disease_names(path='diseases.csv', encoding=None):
    """Return the disease names stored in the second column of a CSV file.

    The first row is treated as a header and skipped. Blank lines and rows
    without a name column are ignored, and surrounding whitespace is stripped.
    The result keeps file order and still contains duplicates.
    """
    names = []
    with open(path, 'r', newline='', encoding=encoding) as file:
        reader = csv.reader(file)
        next(reader, None)  # skip header; the default tolerates an empty file
        for row in reader:
            name = row[1].strip() if len(row) > 1 else ''
            if name:
                names.append(name)
    return names


def group_by_length(names, lengths=(2, 3, 4)):
    """Return a dict mapping each requested character count to its names."""
    groups = {length: [] for length in lengths}
    for name in names:
        # Measure in characters: Chinese names have no spaces, so splitting
        # on ' ' would give 1 for every name and leave all buckets empty.
        bucket = groups.get(len(name))
        if bucket is not None:
            bucket.append(name)
    return groups


def split_by_repetition(names):
    """Partition the distinct names into repeated and non-repeated ones.

    Returns:
        A ``(duplicated, single)`` tuple. ``duplicated`` lists each name that
        occurs more than once, in first-seen order. ``single`` lists the
        names that occur exactly once, sorted. No name is in both lists.
    """
    occurrences = Counter(names)
    duplicated = [name for name, seen in occurrences.items() if seen > 1]
    single = sorted(name for name, seen in occurrences.items() if seen == 1)
    return duplicated, single


def write_numbered_csv(path, names, encoding=None):
    """Write ``names`` to ``path`` with a header and 1-based serial numbers."""
    with open(path, 'w', newline='', encoding=encoding) as file:
        writer = csv.writer(file)
        writer.writerow(['序号', '疾病名称'])
        writer.writerows(enumerate(names, start=1))


def export_analysis(source='diseases.csv', output_dir='.', encoding=None):
    """Classify the names in ``source`` and write one CSV per category.

    Five files are created in ``output_dir``: ``two_words.csv``,
    ``three_words.csv`` and ``four_words.csv`` hold the names with two, three
    and four characters; ``duplicates.csv`` holds the names that appear more
    than once; ``unique.csv`` holds the names that appear exactly once.

    Args:
        source: CSV file whose second column contains the disease names.
        output_dir: Directory that receives the result files.
        encoding: Text encoding used for reading and writing; ``None`` means
            the platform default.

    Returns:
        A dict mapping each output file name to the list of names written.
    """
    names = read_disease_names(source, encoding)
    groups = group_by_length(names)
    duplicated, single = split_by_repetition(names)

    outputs = {
        'two_words.csv': groups[2],
        'three_words.csv': groups[3],
        'four_words.csv': groups[4],
        'duplicates.csv': duplicated,
        'unique.csv': single,
    }
    for filename, rows in outputs.items():
        write_numbered_csv(os.path.join(output_dir, filename), rows, encoding)
    return outputs


if __name__ == '__main__':
    results = export_analysis()
    two_words = results['two_words.csv']
    three_words = results['three_words.csv']
    four_words = results['four_words.csv']
    duplicate_diseases = results['duplicates.csv']
    unique_diseases = results['unique.csv']
6.最后可以使用matplotlib等库对数据进行可视化分析,例如:
import csv
from collections import Counter


def read_disease_names(path='diseases.csv', encoding=None):
    """Return the disease names stored in the second column of a CSV file.

    The first row is treated as a header and skipped. Blank lines and rows
    without a name column are ignored, and surrounding whitespace is stripped.
    The result keeps file order and still contains duplicates.
    """
    names = []
    with open(path, 'r', newline='', encoding=encoding) as file:
        reader = csv.reader(file)
        next(reader, None)  # skip header; the default tolerates an empty file
        for row in reader:
            name = row[1].strip() if len(row) > 1 else ''
            if name:
                names.append(name)
    return names


def count_by_length(names, lengths=(2, 3, 4)):
    """Return, for each requested character count, how many names have it.

    The length is measured in characters with ``len``; Chinese names contain
    no spaces, so counting whitespace-separated words would give 1 for all.
    """
    tally = Counter(len(name) for name in names)
    return [tally[length] for length in lengths]


def length_distribution(names):
    """Return ``(lengths, counts)`` over the distinct names.

    ``lengths`` holds each occurring character count once, in ascending
    order, and ``counts[i]`` is the number of distinct names whose length is
    ``lengths[i]``. All lengths are included, not only two to four.
    """
    tally = Counter(len(name) for name in set(names))
    lengths = sorted(tally)
    return lengths, [tally[length] for length in lengths]


def plot_length_charts(names):
    """Show a bar chart and a line chart describing the name lengths."""
    # Imported here so the counting helpers above remain usable on machines
    # where matplotlib is not installed.
    import matplotlib.pyplot as plt

    # The titles and labels are Chinese, so a font with CJK glyphs is needed
    # or they render as empty boxes. Common CJK fonts are put in front of the
    # existing list; presumably at least one is installed - verify on the
    # target machine and add a local font name here if not.
    plt.rcParams['font.sans-serif'] = [
        'SimHei', 'Microsoft YaHei', 'PingFang SC', 'Arial Unicode MS',
        'Noto Sans CJK SC', 'WenQuanYi Micro Hei',
    ] + list(plt.rcParams['font.sans-serif'])

    # Bar chart: number of names with two, three and four characters
    labels = ['2个字', '3个字', '4个字']
    values = count_by_length(names)
    plt.bar(labels, values)
    plt.title('不同长度疾病名称数量统计')
    plt.xlabel('名称长度')
    plt.ylabel('数量')
    plt.show()

    # Line chart: one point per distinct length. The marker keeps the data
    # visible even when there is only a single length to plot.
    lengths, counts = length_distribution(names)
    plt.plot(lengths, counts, marker='o')
    plt.xticks(lengths)  # lengths are whole numbers; avoid fractional ticks
    plt.title('疾病名称长度分布图')
    plt.xlabel('名称长度')
    plt.ylabel('数量')
    plt.show()


if __name__ == '__main__':
    plot_length_charts(read_disease_names())