Long read libraries of each clone were mapped to
11109478133_WT_assembly
as reference using
minimap2
(Li 2021) with
map-ont
strategy. Variances were called using
clair3
(Zheng et al. 2022)
with ONT specific parameters. Resulting VCF files were further filtered
for minimal depth of 5 and quality 10, normalized and merged for
comparison using bcftools
(Danecek
et al. 2021). Annotation with overlapping/nearby features and
type of mutation was conducted using snpEff
[Cingolani et al. (2012).
cp -n RAW/annotation/11109478133_WT_assembly_annotation.gff3 wt_ref.gff3
awk '/^##FASTA/{p=1}; p==0{print}' < wt_ref.gff3 > wt_rlesef.fix.gff3
mkdir -p snpEff/data/Bsub_wt;
cp wt_ref.fix.gff3 snpEff/data/Bsub_wt/genes.gff
cp -n RAW/WT/11109478133_WT.assembly.fasta wt_ref.fasta
cp wt_ref.fasta snpEff/data/Bsub_wt/sequences.fa
samtools faidx wt_ref.fasta
printf "Bsub_wt.genome : Bacillus_subtilis_WT\n" > snpEff/snpEff.config
docker run --rm -v $PWD:$PWD -w $PWD/snpEff staphb/snpeff snpEff build -gff3 -v Bsub_wt -c snpEff.config -noCheckCds -noCheckProtein
for i in 1 2 3 4 5 6 7 8 10 11; do
s=$(printf "%02d" $i);
n=$(printf "Clone$s");
mkdir -p "$n";
# mapping the clone to WT
minimap2 -t 12 -x map-ont -a --secondary=no wt_ref.fasta RAW/Clone$i/11109478133_Clone$i.rawdata.fastq.gz | samtools sort -t 12 -o $n/$n.mm2.wt.bam;
samtools index $n/$n.mm2.wt.bam;
# calling variants
docker run --rm -v $PWD:$PWD -w $PWD -u $(id -u):$(id -g) hkubal/clair3:latest run_clair3.sh --bam_fn $n/$n.mm2.wt.bam --ref_fn wt_ref.fasta --threads 12 --model_path /opt/models/ont --platform ont --output $n --sample_name=Bsub$s --include_all_ctgs --no_phasing_for_fa --haploid_precise;
# sniffles -i $n/$n.mm2.wt.bam --vcf $n/$n.mm2.snif.vcf --reference wt_ref.fasta --threads 12;
# filter low quality
bcftools view -i 'DP>=5 && QUAL>=10' $n/merge_output.vcf.gz -Oz -o $n/$n.clair3.flt.vcf.gz;
bcftools index $n/$n.clair3.flt.vcf.gz;
#bcftools norm -f wt_ref.fasta -Oz -o $n/$n.norm.vcf.gz $n/$n.clair3.flt.vcf.gz;
#bcftools index $n/$n.norm.vcf.gz;
docker run --rm -v $PWD:$PWD -w $PWD/snpEff staphb/snpeff snpEff -v -c snpEff.config Bsub_wt ../$n/$n.clair3.flt.vcf.gz | bgzip > $n/$n.annot.vcf.gz;
bcftools index $n/$n.annot.vcf.gz;
#~/bin/FEATnotator/bin/FEATnotator.pl --input $n/$n.norm.vcf.gz --gff wt_ref.fix.gff3 --out_prefix $n/$n.feat.vcf.gz --snpAnnotation --refseq wt_ref.fasta --gff_presorted --vcf --plot --indelAnnotation
done;
bcftools merge -m all -Oz -o merged.vcf.gz Clone*/merge_output.vcf.gz
bcftools index merged.vcf.gz
docker run --rm -v $PWD:$PWD -w $PWD/snpEff staphb/snpeff snpEff -v -c snpEff.config Bsub_wt ../merged.vcf.gz | bgzip > merged.annot.vcf.gz
bcftools index merged.annot.vcf.gz
zcat merged.annot.vcf.gz > merged.annot.vcf
The merged.annot.vcf
can be opened in office
spreadsheets as a table and shows which mutations were found in which
clone and what genes might be affected (note that also upstream and
downstream genes are mentioned and the format is not very nice). The
merged.annot.vcf.gz
can also be displayed in IGV
(e.g. online https://igv.org/app/
after loading the WT genome ref_wt.fasta
&
ref_wt.fasta.fai
. After loading the genome, tracks can be
added e.g. ref_wt.fix.gff3
for all gene annotations,
merged.annot.vcf.gz
&
merged.annot.vcf.gz.csi
for all variants or just the
variants of one clone with
Clone01/Clone01.clair3.flt.vcf.gz
&
Clone01/Clone01.clair3.flt.vcf.gz.csi
.
library(vcfR)
library(tidyverse)
# library(pheatmap)
# library(UpSetR)
vcf <- read.vcfR("merged.annot.vcf.gz", verbose=F)
# extract fixed fields + INFO + genotypes
fix <- as_tibble(getFIX(vcf))
gt <- extract.gt(vcf) %>% as_tibble(rownames="VARIANT_ID")
# INFO: parse annotations (FEATnotator often uses fields like GENE, EFFECT, PRODUCT)
info <- vcf@fix[, "INFO"] %>%
as.character() %>%
tibble(INFO=.) %>%
separate(INFO, into=c("GENE","EFFECT"), sep=";", extra="merge", fill="right")
# ---------- LONG FORMAT ----------
# combine variant coordinates + annotation + genotype matrix
df <- bind_cols(fix, info, gt)
# melt to long: one row per sample per variant
long <- df %>%
pivot_longer(cols=starts_with("Bsub"), names_to="Sample", values_to="GT") %>%
filter(GT %in% c("0/1","1/1","1")) # keep only non-reference calls
# clean up effect field (FEATnotator may put multiple; take first)
long <- long %>%
mutate(EFFECT = str_split_fixed(EFFECT, ",", 2)[,1],
GENE = ifelse(GENE=="", NA, GENE))
long %>%
mutate(POS = as.numeric(POS)) %>%
ggplot(aes(x=POS, y=Sample, color=EFFECT)) +
geom_point(alpha=0.7) +
facet_wrap(~CHROM, scales="free_x", ncol=1) +
theme_bw() +
theme(legend.position='none') +
labs(title="Variant positions per clone", x="Genomic position", y="Sample")
Bioinformatics Core Facility, Center for Synthetic Microbiology (SYNMIKRO), Philipps-Universität Marburg, Germany, clemens@thoelken.org↩︎
Bioinformatics Core Facility, Center for Synthetic Microbiology (SYNMIKRO), Philipps-Universität Marburg, Germany↩︎
Faculty of Chemistry and Unit for Chemical Biology, Center for Synthetic Microbiology (SYNMIKRO), Philipps-Universität Marburg, Germany↩︎
Faculty of Chemistry and Unit for Chemical Biology, Center for Synthetic Microbiology (SYNMIKRO), Philipps-Universität Marburg, Germany↩︎