详细逐步解释下列代码:import os.path import re import yaml import csv from tasly import builder_utils ############################ # IntAct - MutationDs # ############################ def parser(databases_directory, download=True): relationships = set() # 加载yml文件 with open('./yml/mutationDsConfig.yml', 'r') as f: config = yaml.safe_load(f) header = config['header'] output_file_name = "mutation_curated_affects_interaction_with.csv" regex = r":(\w+)\(" url = config['mutations_url'] directory = os.path.join(databases_directory, "MutationDs") builder_utils.checkDirectory(directory) file_name = os.path.join(directory, url.split('/')[-1]) if download: builder_utils.downloadDB(url, directory) with open(file_name, 'r', encoding='utf-8') as mf: first = True for line in mf: if first: first = False continue data = line.rstrip("\r\n").split("\t") if len(data) > 12: internal_id = data[0] pvariant= '_'.join(data[1].split(':')) effect = data[5] organism = data[10] interaction = data[11] evidence = data[12] if organism.startswith("9606"): matches = re.finditer(regex, interaction) for matchNum, match in enumerate(matches, start=1): interactor = match.group(1) relationships.add((pvariant, interactor, "CURATED_AFFECTS_INTERACTION_WITH", effect, interaction, evidence, internal_id, "Intact-MutationDs")) # builder_utils.remove_directory(directory) return (relationships, header, output_file_name) if __name__ == '__main__': databases_directory = './databases' relationships, header, output_file_name = parser(databases_directory, download=True) # 新建CSV文件并写入表头 with open(os.path.join('./databases/MutationDs', output_file_name), 'w', newline='', encoding='utf-8') as f: writer = csv.writer(f) writer.writerow(header) for item in relationships: writer.writerow(item) print("Data saved to {} successfully!".format('entities.csv'))
时间: 2024-01-15 14:03:47 浏览: 144
这段代码的作用是解析一个特定的文件,并将其中的数据存储到一个 CSV 文件中。现在我将逐步解释代码:
```
import os.path
import re
import yaml
import csv
from tasly import builder_utils
```
这段代码导入了三个 Python 标准库模块 `os.path`、`re`、`csv`,一个第三方模块 `yaml`(由 PyYAML 提供),以及从 `tasly` 包中导入的自定义模块 `builder_utils`。
```
def parser(databases_directory, download=True):
    """Parse the IntAct mutation export into relationship tuples.

    Reads ``header`` and ``mutations_url`` from ``./yml/mutationDsConfig.yml``
    (resolved against the current working directory), optionally downloads
    the file the URL points to, and scans it as tab-separated text.

    Args:
        databases_directory: Base directory; the data file lives in its
            ``MutationDs`` sub-directory, which is passed to
            ``builder_utils.checkDirectory`` (presumably created if missing;
            verify against ``builder_utils``).
        download: When true, ``builder_utils.downloadDB`` is called before
            parsing. When false, the file must already be on disk.

    Returns:
        A tuple ``(relationships, header, output_file_name)``.
        ``relationships`` is a set of 8-tuples
        ``(pvariant, interactor, "CURATED_AFFECTS_INTERACTION_WITH", effect,
        interaction, evidence, internal_id, "Intact-MutationDs")``;
        ``header`` comes straight from the config file.

    Raises:
        KeyError: If ``header`` or ``mutations_url`` is missing in the config.
        OSError: If the config file or the data file cannot be opened.
    """
    # Read the config with an explicit encoding, like the data file below,
    # so decoding does not depend on the platform's default locale.
    with open('./yml/mutationDsConfig.yml', 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    header = config['header']
    url = config['mutations_url']
    output_file_name = "mutation_curated_affects_interaction_with.csv"

    # Captures the word between ':' and '(' in the interaction column.
    # Compiled once because it is applied to every data row.
    interactor_pattern = re.compile(r":(\w+)\(")

    directory = os.path.join(databases_directory, "MutationDs")
    builder_utils.checkDirectory(directory)
    # The local file name is the last path segment of the download URL.
    file_name = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    relationships = set()
    with open(file_name, 'r', encoding='utf-8') as mf:
        next(mf, None)  # skip the header row (no-op on an empty file)
        for line in mf:
            data = line.rstrip("\r\n").split("\t")
            if len(data) <= 12:
                # Rows without all 13 columns cannot be indexed safely.
                continue
            organism = data[10]
            # Keep only rows whose organism column starts with "9606"
            # (presumably the NCBI taxonomy id for human).
            if not organism.startswith("9606"):
                continue
            internal_id = data[0]
            pvariant = data[1].replace(':', '_')
            effect = data[5]
            interaction = data[11]
            evidence = data[12]
            for match in interactor_pattern.finditer(interaction):
                relationships.add((
                    pvariant,
                    match.group(1),
                    "CURATED_AFFECTS_INTERACTION_WITH",
                    effect,
                    interaction,
                    evidence,
                    internal_id,
                    "Intact-MutationDs",
                ))

    # NOTE: the download directory is not removed after parsing.
    return (relationships, header, output_file_name)
```
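这段代码定义了一个名为 `parser` 的函数,它接受一个参数 `databases_directory` 和一个可选参数 `download`,默认值为 `True`。该函数首先加载一个名为 `mutationDsConfig.yml` 的 YAML 文件,该文件包含一些配置信息,如 `header`、`mutations_url` 等。然后,函数使用 `os.path` 模块来构建一个目录路径,即 `databases_directory` 下的 `MutationDs` 子目录。接着,函数使用 `builder_utils` 模块提供的 `checkDirectory` 函数检查该目录是否存在,如果不存在则创建该目录。当 `download` 为 `True` 时,函数使用 `builder_utils` 模块提供的 `downloadDB` 函数下载 `mutations_url` 指向的文件(本地文件名取 URL 的最后一段,例如 `mutations.tsv`),该文件存储了一些基因突变相关的数据。函数接着打开该文件,跳过第一行表头,逐行按制表符切分;只有列数大于 12 且 organism 字段(第 11 列)以 `9606` 开头的行才会被处理。对这些行,函数使用 `re` 模块的 `finditer` 在 interaction 字段中找到所有匹配正则表达式 `:(\w+)\(` 的子字符串,把捕获到的名称作为 interactor,并将由 pvariant、interactor、关系类型 `CURATED_AFFECTS_INTERACTION_WITH`、effect、interaction、evidence、internal_id 和来源 `Intact-MutationDs` 组成的元组加入名为 `relationships` 的集合中(集合会自动去除重复的元组)。最后,函数返回三个值:`relationships`、`header` 和 `output_file_name`。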
```
if __name__ == '__main__':
    # Script entry point: parse the mutation data and dump it to a CSV file.
    databases_directory = './databases'
    relationships, header, output_file_name = parser(databases_directory, download=True)

    # The CSV goes into the same MutationDs directory the parser works in;
    # derive it from databases_directory instead of repeating the literal.
    output_path = os.path.join(databases_directory, 'MutationDs', output_file_name)

    # Create the CSV file and write the header row first.
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        # relationships is a set, so row order is not stable across runs.
        writer.writerows(relationships)

    # Report the path that was actually written.
    print("Data saved to {} successfully!".format(output_path))
```
这段代码检查当前模块是否为主模块(即是否被直接运行),如果是,则执行下面的代码。首先,它定义了一个名为 `databases_directory` 的变量,该变量指定了存储数据的目录。然后,它调用 `parser` 函数,将 `databases_directory` 和 `download=True` 作为参数传递给该函数。函数返回三个值,这些值被分别赋值给三个变量 `relationships`、`header` 和 `output_file_name`。接着,代码使用 `csv` 模块在 `./databases/MutationDs` 目录下创建一个以 `output_file_name` 命名的新 CSV 文件,并将 `header` 作为表头写入该文件。随后,代码遍历 `relationships` 集合中的元素,并将每个元素作为一行写入 CSV 文件中;由于集合是无序的,各行的顺序并不固定。最后,代码打印一条保存成功的提示信息。
阅读全文