NCBI数据库中存放着大量得WGS测序数据,随着二代测序得发展,测序成本得降低以及各种微生物组计划得实施,越来越多得测序数据得以存储,各种组装级别得参考基因组(complete genome, chromosome, scaffold, contigs)提交在数据库中。
mNGS在病原微生物方向的发展,对病原微生物的鉴定依赖这些参考序列,但由于一些物种只有组装级别较低的参考基因组,就会导致假阳性的出现。在数据库整理及去除低组装参考基因组中的污染序列有利于鉴定过程中降低假阳性,尤其是真菌和原虫。
在这之前无意中看到一篇去除参考基因组中污染序列方法,即:
1.通过将参考基因组切分成100bp的片段,每条序列之间含有50bp的overlap,即序列移动的步长。
2.数据库的准备,不含有该参考基因组。
3.分类软件进行鉴定
4.bowtie进行比对
5.将分类软件鉴定到的序列和bowtie比对到序列用"N"代替原来的碱基。
6.将序列重更新拼接起来,此时可以该参考基因组即可认为是clean参考基因组。
因此根据该方法尝试写了处理序列的脚本:经过测试只能在本地用pycharm执行,在服务器上由于随机性,序列重新拼接并不是按照既定的顺序拼接,后续还需要进行调整。
经过测试,可以初步确定污染序列,NCBI验证通过该方式得到的污染在nt库在线比对,比对到其他物种
python">#_*_coding:UTF-8_*_
# import argparse
from collections import defaultdict
# 读取原始数据并生成去除污染序列短序列
def remove_seq():
origin_splited_dict = {}
# gene_id = []
# 读取原始序列文件
# with open(r"D:\My Projects\02_参考基因组处理\Penicillium_digitatum_Pd01-ZJU_100bp.fasta","r") as origin_file:
with open(r"D:\My Projects\02_参考基因组处理\Penicillium_digitatum_Pd01-ZJU_100bp_1.fasta","r") as origin_file:
for line in origin_file:
if line.startswith(">"):
id = line.rstrip("\n")
# gene_id.append(id.split("|")[0])
origin_splited_dict[id] = ""
else:
origin_splited_dict[id] = line.rstrip("\n")
# print(len(origin_splited_dict))
origin_file.close()
# 读取比对上其他参考基因组的序列
mapped_file_list = []
# with open(r"D:\My Projects\02_参考基因组处理\Penicillium_digitatum_Pd01-ZJU_100bp.mapped.fasta","r") as mapped_file:
with open(r"D:\My Projects\02_参考基因组处理\Penicillium_digitatum_Pd01-ZJU_100bp.mapped.fasta","r") as mapped_file:
for line in mapped_file:
if line.startswith(">"):
continue
else:
mapped_file_list.append(line.rstrip("\n"))
# print(len(mapped_file_list))
# print(gene_id)
# print(mapped_file_list)
mapped_file.close()
# 生成新的去除比对上其他参考基因组的序列
with open(r"D:\My Projects\02_参考基因组处理\test.fasta","w") as new_file:
for mapped_v in mapped_file_list:
# print(mapped_v)
for origin_splited_dict_k, origin_splited_v in list(origin_splited_dict.items()): # 字典在遍历时,不能进行修改,若需要修改,可先将其转化为列表
if mapped_v == origin_splited_v.upper():
# del origin_splited_dict[origin_splited_dict_k] # 删除
origin_splited_dict[origin_splited_dict_k] = "N" * 100 # 比对上的序列用N代替
for k in origin_splited_dict.keys():
new_file.write(k + "\n")
new_file.write(origin_splited_dict[k] + "\n")
new_file.close()
def assembly_seq():
"""
序列拼接
:return: None
"""
# join 拼接
# 读取去除污染序列文件,并转换为一键映射多个值的字典
assembly_dict = {}
with open(r"D:\My Projects\02_参考基因组处理\test.fasta","r") as file1:
for line in file1:
if line.startswith(">"):
id1 = line.rstrip("\n")
assembly_dict[id1] = ""
else:
assembly_dict[id1] = line.rstrip("\n")
# print(assembly_dict)
assembly_dict_list =list(assembly_dict.items())
# print(assembly_dict_list)
assembly_dict_new =defaultdict(list)
for k, v in assembly_dict_list:
id2 = k.split("|")[0]
assembly_dict_new[id2].append(v)
# print(assembly_dict_new)
# 遍历字典并进行序列拼接
with open(r"D:\My Projects\02_参考基因组处理\test3.fasta","w") as file2:
for k1 , v1 in assembly_dict_new.items():
#
file2.write(k1 + "\n")
file2.write(v1[0] + "\n")
for n in range(1, len(v1)):
seq = v1[n][50:100]
file2.write(seq)
if n % 2 == 0:
file2.write("\n")
if __name__ == "__main__":
fa_split_100bp = {}
assembly_name = []
# parse = argparse.ArgumentParser("Remove contaminated sequence from reference sequence")
# parse.add_argument("-i", "--inputfile",type=str, help="Input original split reference genome sequence")
# parse.add_argument("-I", "--inputfile",type=str, help="Input alignment of reference genome sequences")
# parse.add_argument("-o", "--outputfile",type=str, help="Output alignment of reference genome sequences")
# args = parse.parse_args()
# read_fasta()
remove_seq()
assembly_seq()