伪代码记录
参数
param_dict={
{"Asia1":"GCA_008797915.1_ASM879791v1_genomic"},
{"SAT3":"GCA_008798995.1_ASM879899v1_genomic"},
{"SAT2":"GCA_008799015.1_ASM879901v1_genomic"},
{"SAT1":"GCA_008799035.1_ASM879903v1_genomic"},
{"C":"GCA_008799055.1_ASM879905v1_genomic"},
{"A":"GCA_008799075.1_ASM879907v1_genomic"},
{"O":"GCF_002816555.1_ASM281655v1_genomic"},
}
DB_DIR=/data_raid1/bioinfo/app/cromwell/cromwell-executions/data/database
SampleID=S1
R1=${SampleID}_R1.fastq
R2=${SampleID}_R2.fastq
创建索引
cd ${DB_DIR}/FMDV_reference
for refSuffix in param_dict.items():
bowtie2-build ${refSuffix}.fna ${refSuffix}
krakenuniq
# {step: krakenuniq}
krakenuniq --report-file report.tsv --db ${DB_DIR}/kraken_uniq_database \
--threads ${T} --output report.kraken \
--paired ${Fq1} ${Fq2} --unclassified-out unclassifed.txt --classified-out classifed.txt
# {step:parse krakenuniq report}
# result file list:
# - fastqc_result.json
# - report.tsv
# - report.kraken
# - res.json
# - tax.json
# - result/${SampleID}_R${n}_${SeroType}_{ReadsNum}.fastq
溯源
for SeroType,refSuffix in #{Assemble Param data from Part1 result}:
# 进入一个单独的目录,该目录下全是同一 SeroType 不同样本的结果序列
mkdir temp_xx && cd temp_xx
for SampleID,ReadsNum in #{Assemble Param data from Part1 reuslt By Sample}:
Fq1=${Path2Sample}/${SampleID}_R1_${SeroType}_{ReadsNum}.fastq
Fq2=${Path2Sample}/${SampleID}_R2_${SeroType}_{ReadsNum}.fastq
bowtie2 -x ${DB_DIR}/FMDV_reference/${refSuffix} -U ${Fq1},${Fq2} -S paired.sam
samtools view -bS paired.sam -o paired.bam
samtools sort -o paired.sorted.bam paired.bam
# {samtools mpileup 命令已过时,改用 bcftools mpileup}
# {samtools mpileup -Ou -f ${DB_DIR}/FMDV_reference/${refSuffix}.fna paired.bam}
# 一致性序列,**该步骤会假设所有位点都是双倍体**
bcftools mpileup -Ou -f ${DB_DIR}/FMDV_reference/${refSuffix}.fna -o mpileup.vcf paired.sorted.bam
bcftools call -c -o all_site.vcf mpileup.vcf
# vcfutils.pl 属于 bcftools 软件的工具,将 vcf 转换成fastq
vcfutils.pl vcf2fq all_site.vcf > cns.fastq
# 将 fastq 转换成fasta
seqtk seq -aQ64 -q20 -n N cns.fastq > cns.fasta
# 将序列ID替换
sed -i 's/^>.*$/>${SampleID}_${SeroType}_{ReadsNum}/g' ${SampleID}_${SeroType}_{ReadsNum}_cns.fasta
cat ${DB_DIR}/FMDV_type/${SeroType}.fa temp_xx/*_cns.fasta > all.fa
mafft all.fa > fmdv_all.aln
trimal -in fmdv_all.aln -out fmdv.aln.trimmed
fasttree -gamma -nt -gtr -out fmdv.tree fmdv.aln.trimmed
# 最后将所有的 fmdv.tree 汇总