import codecs

# Replace every occurrence of ``search_text`` in the text file with
# ``replace_text`` (file kept as UTF-8), then re-encode the file as UTF-16.

# Text to search for.
search_text = "F:"
# Replacement text. A raw string keeps the backslash literal instead of
# relying on the invalid "\p" escape sequence; the value is unchanged.
# NOTE(review): if this is meant to be a UNC host prefix it would need two
# leading backslashes (r"\\pc-20190606") -- confirm the intended value.
replace_text = r"\pc-20190606"

# Read the whole file as UTF-8.
with open(r'林查胶印ID号.txt', 'r', encoding='UTF-8') as file:
    data = file.read()

# Search and replace in memory.
data = data.replace(search_text, replace_text)

# Write the replaced content back, still as UTF-8.
with open(r'林查胶印ID号.txt', 'w', encoding='UTF-8') as file:
    file.write(data)

# Report that the replacement is done.
print("文本已替换")

# Re-read the UTF-8 file and rewrite it as UTF-16. The context managers
# close both handles even if the read or write raises.
with codecs.open("林查胶印ID号.txt", 'r', "utf-8") as bfile:
    ss = bfile.read()

with codecs.open("林查胶印ID号.txt", "w", "utf-16") as s:
    s.write(ss)

时间: 2023-04-09 20:04:31 浏览: 67

`import codecs` 语句用于导入 Python 标准库中的 `codecs` 模块，该模块用于处理不同编码的文本。它提供了一些函数（例如 `codecs.open()`），可以读取和写入不同编码的文本文件，从而将文本文件从一种编码转换为另一种编码。这个模块在处理中文等非 ASCII 字符时非常有用。

import codecs # 创建一个变量并存储我们要搜索的文本 search_text = "F:\" # 创建一个变量并存储我们要添加的文本 replace_text = "\\pc-20190606" # 使用 open() 函数以只读模式打开我们的文本文件 with open(r'林查胶印ID号.txt', 'r', encoding='UTF-8') as file: # 使用 read() 函数读取文件内容并将它们存储在一个新变量中 data = file.read() # 使用 replace() 函数搜索和替换文本 data = data.replace(search_text, replace_text) # 以只写模式打开我们的文本文件以写入替换的内容 with open(r'林查胶印ID号.txt', 'w', encoding='UTF-8') as file: # 在我们的文本文件中写入替换的数据 file.write(data) # 打印文本已替换 print("文本已替换") bfile = codecs.open("林查胶印ID号.txt", 'r', "utf-8") ss = bfile.read() # print (ss) bfile.close() s = codecs.open("林查胶印ID号.txt", "w", "utf-16") s.write(ss) s.close() 帮我找出bug

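`import codecs` 语句导入的是 Python 标准库中的 `codecs` 模块，用于处理不同编码的文本文件。它提供了一些编码和解码的函数，可以将文本文件从一种编码格式转换为另一种编码格式，以便在不同的操作系统和应用程序之间进行交互。上面代码中的 bug 在于 `search_text = "F:\"`：在普通字符串中，反斜杠会转义紧随其后的引号，字符串因此没有结束，解释器会报 `SyntaxError`；应写成 `"F:\\"`（注意原始字符串 `r"F:\"` 同样不合法，因为原始字符串不能以单个反斜杠结尾）。另外，`"\\pc-20190606"` 的实际值只有一个前导反斜杠（`\pc-20190606`）；如果想得到以两个反斜杠开头的 UNC 路径，需要写成 `"\\\\pc-20190606"` 或 `r"\\pc-20190606"`。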

import sys import re import jieba import codecs import gensim import numpy as np import pandas as pd def segment(doc: str): stop_words = pd.read_csv('data/stopwords.txt', index_col=False, quoting=3, names=['stopword'], sep='\n', encoding='utf-8') stop_words = list(stop_words.stopword) reg_html = re.compile(r'<[^>]+>', re.S) # 去掉html标签数字等 doc = reg_html.sub('', doc) doc = re.sub('[０-９]', '', doc) doc = re.sub('\s', '', doc) word_list = list(jieba.cut(doc)) out_str = '' for word in word_list: if word not in stop_words: out_str += word out_str += ' ' segments = out_str.split(sep=' ') return segments def doc2vec(file_name, model): start_alpha = 0.01 infer_epoch = 1000 doc = segment(codecs.open(file_name, 'r', 'utf-8').read()) vector = model.docvecs[doc_id] return model.infer_vector(doc) # 计算两个向量余弦值 def similarity(a_vect, b_vect): dot_val = 0.0 a_norm = 0.0 b_norm = 0.0 cos = None for a, b in zip(a_vect, b_vect): dot_val += a * b a_norm += a ** 2 b_norm += b ** 2 if a_norm == 0.0 or b_norm == 0.0: cos = -1 else: cos = dot_val / ((a_norm * b_norm) ** 0.5) return cos def test_model(file1, file2): print('导入模型') model_path = 'tmp/zhwk_news.doc2vec' model = gensim.models.Doc2Vec.load(model_path) vect1 = doc2vec(file1, model) # 转成句子向量 vect2 = doc2vec(file2, model) print(sys.getsizeof(vect1)) # 查看变量占用空间大小 print(sys.getsizeof(vect2)) cos = similarity(vect1, vect2) print('相似度：%0.2f%%' % (cos * 100)) if __name__ == '__main__': file1 = 'data/corpus_test/t1.txt' file2 = 'data/corpus_test/t2.txt' test_model(file1, file2) 有什么问题，怎么解决

在 `doc2vec()` 函数中，`vector = model.docvecs[doc_id]` 这一行访问了未定义的变量 `doc_id`，会导致 `NameError` 错误。而且 `vector` 在赋值之后从未被使用，这会带来不必要的计算和内存占用，所以这一行应当删除，直接返回 `model.infer_vector()` 推断出的向量即可。下面的代码为 `doc2vec()` 增加了 `doc_id` 参数，并在调用时传入文档的标识符；但函数内部已不再使用它，因此该参数也可以省略。同时，原代码定义了 `start_alpha` 和 `infer_epoch` 却没有传给 `infer_vector()`，修改后的代码把它们传了进去；注意在 gensim 4.x 中 `infer_vector()` 的 `steps` 参数已改名为 `epochs`。最后，在 `test_model()` 函数中，`sys.getsizeof()` 返回的是数组对象整体的内存占用（包含对象头等开销）；如果只想知道向量数据本身占用的字节数，可以使用 `np.ndarray.nbytes` 属性。下面是修改后的代码：

```python
import sys
import re
import jieba
import codecs
import gensim
import numpy as np
import pandas as pd


def segment(doc: str):
    stop_words = pd.read_csv('data/stopwords.txt', index_col=False, quoting=3, names=['stopword'], sep='\n', encoding='utf-8')
    stop_words = list(stop_words.stopword)
    reg_html = re.compile(r'<[^>]+>', re.S)
    # 去掉html标签数字等
    doc = reg_html.sub('', doc)
    doc = re.sub('[０-９]', '', doc)
    doc = re.sub('\s', '', doc)
    word_list = list(jieba.cut(doc))
    out_str = ''
    for word in word_list:
        if word not in stop_words:
            out_str += word
            out_str += ' '
    segments = out_str.split(sep=' ')
    return segments


def doc2vec(file_name, model, doc_id):
    start_alpha = 0.01
    infer_epoch = 1000
    doc = segment(codecs.open(file_name, 'r', 'utf-8').read())
    return model.infer_vector(doc, alpha=start_alpha, steps=infer_epoch)


# 计算两个向量余弦值
def similarity(a_vect, b_vect):
    dot_val = 0.0
    a_norm = 0.0
    b_norm = 0.0
    cos = None
    for a, b in zip(a_vect, b_vect):
        dot_val += a * b
        a_norm += a ** 2
        b_norm += b ** 2
    if a_norm == 0.0 or b_norm == 0.0:
        cos = -1
    else:
        cos = dot_val / ((a_norm * b_norm) ** 0.5)
    return cos


def test_model(file1, file2):
    print('导入模型')
    model_path = 'tmp/zhwk_news.doc2vec'
    model = gensim.models.Doc2Vec.load(model_path)
    vect1 = doc2vec(file1, model, doc_id=0)  # 转成句子向量
    vect2 = doc2vec(file2, model, doc_id=1)
    print(vect1.nbytes)  # 查看向量大小
    print(vect2.nbytes)
    cos = similarity(vect1, vect2)
    print('相似度：%0.2f%%' % (cos * 100))


if __name__ == '__main__':
    file1 = 'data/corpus_test/t1.txt'
    file2 = 'data/corpus_test/t2.txt'
    test_model(file1, file2)
```

相关推荐

抓绒：F＃的Json映射器

python实现的一个中文文本摘要程序.pdf

ArabicProcessingCog:一个Python软件包，可为阿拉伯语做词干，标记，断句，分段，规范化，POS标记

怎么办？ decoded_text = base64.b64decode(ciphertext).decode("hex") LookupError: 'hex' is not a text encoding; use codecs.decode() to handle arbitrary codecs

我有三个TXT文件在D盘的一个文件夹里，文本内容为很多汉字。怎么用python读取它们？写出代码（我后期要对他们进行预处理）

一个二进制文件中含有非ascii字符，怎么正确用python解码

python把一个文件转换为utf-8格式

with codecs.open(input_file, 'r', 'utf-8') as infs:可以打开json文件吗

如何使用python 在gbk环境中 将 'æ— ' 表示为汉字

python 读取文件夹中所有文件名字并存入,txt文件

如何用python打开一个二进制文件，它使用多种编码格式混合而成，如何使用GB2312,GB18030,GBK,BIG5,unicode，utf-8,utf-16 be,utf-16le格式，两个字节一组检查是否符合汉字编码，如果连续2个符合某个编码，就打印出来

codecs txt导入到pandas excel

上面的代码中的source_folder 请改成input输入的形式，可以更改的输入地址

最新推荐

zigbee-cluster-library-specification

管理建模和仿真的文件

实现实时数据湖架构：Kafka与Hive集成

可见光定位LED及其供电硬件具体型号，广角镜头和探测器，实验设计具体流程步骤，

JSBSim Reference Manual

"互动学习：行动中的多样性与论文攻读经历"

实现实时监控告警系统：Kafka与Grafana整合

解释这行代码 c = ((double)rand() / RAND_MAX) * (a + b - fabs(a - b)) + fabs(a - b);

c++校园超市商品信息管理系统课程设计说明书(含源代码) (2).pdf

关系数据表示学习

如何使用python 在gbk环境中将 'æ— ' 表示为汉字