GATK 變異分析
以下內容來自網絡,尚未經過驗證。
GATK 變異分析對於大數據樣本可能會比較慢,因此可以按照染色體拆分後進行多線程並行計算。
下面是我寫的一個python多線程腳本,僅供參考,拙劣之處敬請指正。
#!/usr/bin/python3
"""Run GATK HaplotypeCaller once per chromosome, several chromosomes at a time.

One shell command is built per entry of ``chr_list``; each command runs in
its own thread (the thread only waits on the external ``gatk`` process).
Every chromosome is processed, with at most ``max_parallel`` commands
running concurrently.
"""
import _thread
import os
import sys
import threading
import time

bam_file = "a.mkdup.bam"
out_file_prefix = "flower"
chr_list = [
    "CHR01", "CHR02", "CHR03", "CHR04", "CHR05", "CHR06", "CHR07",
    "CHR08", "CHR09", "CHR10", "CHR11", "CHR12", "CHR13",
]
reference_fasta = "/mnt/j/BSA/02-read-align/Tifrunner2.fasta"

# The original script sliced the command list to its first 11 entries, which
# silently skipped CHR12 and CHR13.  Presumably the slice was meant as a
# concurrency cap, so the cap is kept here while all commands are executed.
max_parallel = 11

# When set to a truthy value, workers that have not started their command yet
# skip it.
exitFlag = 0


def build_commands(chromosomes, bam, prefix, reference=reference_fasta):
    """Return one ``gatk HaplotypeCaller`` shell command per chromosome.

    Args:
        chromosomes: iterable of interval names passed to ``--intervals``.
        bam: path of the input BAM file (``-I``).
        prefix: prefix of each output file, ``<prefix>-<chromosome>.erc.g.vcf``.
        reference: path of the reference FASTA (``-R``).

    Returns:
        A list of command strings, in the same order as ``chromosomes``.
    """
    template = (
        "gatk HaplotypeCaller --intervals {0} -R {1} -I {2}"
        " -ERC GVCF -O {3}-{0}.erc.g.vcf"
    )
    return [template.format(name, reference, bam, prefix) for name in chromosomes]


muthreads = build_commands(chr_list, bam_file, out_file_prefix)

# Limits how many gatk processes run at the same time.
_slots = threading.BoundedSemaphore(max_parallel)


def print_time(threadName, delay, counter, comander):
    """Wait ``delay`` seconds, then run ``comander`` through the system shell.

    Args:
        threadName: name of the calling thread (not used by the command).
        delay: seconds to sleep before launching the command.
        counter: unused; kept so existing call sites keep working.
        comander: shell command line to execute.

    Returns:
        The status returned by ``os.system``, or ``None`` when ``exitFlag``
        is set and the command was skipped.
    """
    if exitFlag:
        # threadName is a plain string, so the original ``threadName.exit()``
        # raised AttributeError; returning is what ends the worker.
        return None
    time.sleep(delay)
    print(comander)
    status = os.system(comander)  # run the command through the OS shell
    if status != 0:
        # Do not let a failed chromosome pass unnoticed.
        print(
            "command failed (status {0}): {1}".format(status, comander),
            file=sys.stderr,
        )
    return status


class myThread(threading.Thread):
    """Thread that runs one shell command via ``print_time``."""

    def __init__(self, threadID, name, counter, comander):
        """Store the worker's id, name, start delay and command.

        Args:
            threadID: numeric identifier of the worker.
            name: thread name used in the progress messages.
            counter: value handed to ``print_time`` as its ``delay``.
            comander: shell command line to execute.
        """
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter
        self.comander = comander

    def run(self):
        """Run the command once a concurrency slot is available."""
        with _slots:
            print("開始線程:" + self.name)
            print_time(self.name, self.counter, 5, self.comander)
            print("退出線程:" + self.name)


def create_threads(commands):
    """Create (but do not start) one ``myThread`` per command.

    Args:
        commands: iterable of shell command strings.

    Returns:
        A list of unstarted ``myThread`` objects, one per command.
    """
    workers = []
    for index, command in enumerate(commands):
        print(index)
        print(command)
        workers.append(myThread(index, "Thread-" + str(index), 2, command))
    return workers


def main():
    """Start a worker for every command and wait for all of them."""
    threadlist = create_threads(muthreads)
    for worker in threadlist:
        worker.start()
    for worker in threadlist:
        worker.join()
    print("運行結束退出主線程")


if __name__ == "__main__":
    main()
以下內容來自網絡,尚未經過驗證。
多條染色體、同一樣本的vcf文件合並:
# Gather the per-chromosome VCF files of one sample into a single merge.vcf.
#
# The "-I ... \" lines below can be generated with either of:
#   for i in {1..22} X Y ;do echo "-I final_chr$i.vcf" '\';done
#   for i in {10..19} {1..9} M X Y ;do echo "-I final_chr$i.vcf" '\';done
module load java/1.8.0_91
GATK=/home/jianmingzeng/biosoft/GATK/gatk-4.0.3.0/gatk

# Inputs are listed in the order chr1..chr22, X, Y.
"$GATK" GatherVcfs \
    -I final_chr1.vcf \
    -I final_chr2.vcf \
    -I final_chr3.vcf \
    -I final_chr4.vcf \
    -I final_chr5.vcf \
    -I final_chr6.vcf \
    -I final_chr7.vcf \
    -I final_chr8.vcf \
    -I final_chr9.vcf \
    -I final_chr10.vcf \
    -I final_chr11.vcf \
    -I final_chr12.vcf \
    -I final_chr13.vcf \
    -I final_chr14.vcf \
    -I final_chr15.vcf \
    -I final_chr16.vcf \
    -I final_chr17.vcf \
    -I final_chr18.vcf \
    -I final_chr19.vcf \
    -I final_chr20.vcf \
    -I final_chr21.vcf \
    -I final_chr22.vcf \
    -I final_chrX.vcf \
    -I final_chrY.vcf \
    -O merge.vcf
合並的時候需要注意:用 -I 傳入的vcf文件順序,必須與每個vcf文件頭(header)中染色體(contig)的排列順序一致。
以上就是python實現GATK多線程加速示例的詳細內容,更多關於python GATK多線程加速的資料請關注軟件開發網其它相關文章!