Methods

Long read libraries of each clone were mapped to 11109478133_WT_assembly as reference using minimap2 (Li 2021) with map-ont strategy. Variances were called using clair3 (Zheng et al. 2022) with ONT specific parameters. Resulting VCF files were further filtered for minimal depth of 5 and quality 10, normalized and merged for comparison using bcftools (Danecek et al. 2021). Annotation with overlapping/nearby features and type of mutation was conducted using snpEff [Cingolani et al. (2012).

cp -n RAW/annotation/11109478133_WT_assembly_annotation.gff3 wt_ref.gff3
awk '/^##FASTA/{p=1}; p==0{print}' < wt_ref.gff3 > wt_rlesef.fix.gff3
mkdir -p snpEff/data/Bsub_wt;
cp wt_ref.fix.gff3 snpEff/data/Bsub_wt/genes.gff
cp -n RAW/WT/11109478133_WT.assembly.fasta wt_ref.fasta
cp wt_ref.fasta snpEff/data/Bsub_wt/sequences.fa
samtools faidx wt_ref.fasta
printf "Bsub_wt.genome : Bacillus_subtilis_WT\n" > snpEff/snpEff.config 
docker run --rm -v $PWD:$PWD -w $PWD/snpEff staphb/snpeff snpEff build -gff3 -v Bsub_wt -c snpEff.config -noCheckCds -noCheckProtein

for i in 1 2 3 4 5 6 7 8 10 11; do
    s=$(printf "%02d" $i);
    n=$(printf "Clone$s");
    mkdir -p "$n";
    # mapping the clone to WT
    minimap2 -t 12 -x map-ont -a --secondary=no wt_ref.fasta RAW/Clone$i/11109478133_Clone$i.rawdata.fastq.gz | samtools sort -t 12 -o $n/$n.mm2.wt.bam;
    samtools index $n/$n.mm2.wt.bam;
    # calling variants
    docker run --rm -v $PWD:$PWD -w $PWD -u $(id -u):$(id -g) hkubal/clair3:latest run_clair3.sh --bam_fn $n/$n.mm2.wt.bam --ref_fn wt_ref.fasta --threads 12 --model_path /opt/models/ont --platform ont --output $n --sample_name=Bsub$s --include_all_ctgs --no_phasing_for_fa --haploid_precise;
    # sniffles -i $n/$n.mm2.wt.bam --vcf $n/$n.mm2.snif.vcf --reference wt_ref.fasta --threads 12;
    # filter low quality
    bcftools view -i 'DP>=5 && QUAL>=10' $n/merge_output.vcf.gz -Oz -o $n/$n.clair3.flt.vcf.gz;
    bcftools index $n/$n.clair3.flt.vcf.gz;
    #bcftools norm -f wt_ref.fasta -Oz -o $n/$n.norm.vcf.gz $n/$n.clair3.flt.vcf.gz;
    #bcftools index $n/$n.norm.vcf.gz;
    
    docker run --rm -v $PWD:$PWD -w $PWD/snpEff staphb/snpeff snpEff -v -c snpEff.config Bsub_wt ../$n/$n.clair3.flt.vcf.gz | bgzip > $n/$n.annot.vcf.gz;
   bcftools index $n/$n.annot.vcf.gz;
   #~/bin/FEATnotator/bin/FEATnotator.pl --input $n/$n.norm.vcf.gz --gff wt_ref.fix.gff3 --out_prefix $n/$n.feat.vcf.gz --snpAnnotation --refseq wt_ref.fasta --gff_presorted --vcf --plot --indelAnnotation
done;

bcftools merge -m all -Oz -o merged.vcf.gz Clone*/merge_output.vcf.gz
bcftools index merged.vcf.gz
docker run --rm -v $PWD:$PWD -w $PWD/snpEff staphb/snpeff snpEff -v -c snpEff.config Bsub_wt ../merged.vcf.gz | bgzip > merged.annot.vcf.gz
bcftools index merged.annot.vcf.gz
zcat merged.annot.vcf.gz > merged.annot.vcf

Results

The merged.annot.vcf can be opened in office spreadsheets as a table and shows which mutations were found in which clone and what genes might be affected (note that also upstream and downstream genes are mentioned and the format is not very nice). The merged.annot.vcf.gz can also be displayed in IGV (e.g. online https://igv.org/app/ after loading the WT genome ref_wt.fasta & ref_wt.fasta.fai. After loading the genome, tracks can be added e.g. ref_wt.fix.gff3 for all gene annotations, merged.annot.vcf.gz & merged.annot.vcf.gz.csi for all variants or just the variants of one clone with Clone01/Clone01.clair3.flt.vcf.gz & Clone01/Clone01.clair3.flt.vcf.gz.csi.

library(vcfR)
library(tidyverse)
# library(pheatmap)
# library(UpSetR)

vcf <- read.vcfR("merged.annot.vcf.gz", verbose=F)

# extract fixed fields + INFO + genotypes
fix <- as_tibble(getFIX(vcf))
gt  <- extract.gt(vcf) %>% as_tibble(rownames="VARIANT_ID")

# INFO: parse annotations (FEATnotator often uses fields like GENE, EFFECT, PRODUCT)
info <- vcf@fix[, "INFO"] %>%
  as.character() %>%
  tibble(INFO=.) %>%
  separate(INFO, into=c("GENE","EFFECT"), sep=";", extra="merge", fill="right")

# ---------- LONG FORMAT ----------
# combine variant coordinates + annotation + genotype matrix
df <- bind_cols(fix, info, gt)

# melt to long: one row per sample per variant
long <- df %>%
  pivot_longer(cols=starts_with("Bsub"), names_to="Sample", values_to="GT") %>%
  filter(GT %in% c("0/1","1/1","1"))   # keep only non-reference calls

# clean up effect field (FEATnotator may put multiple; take first)
long <- long %>%
  mutate(EFFECT = str_split_fixed(EFFECT, ",", 2)[,1],
         GENE   = ifelse(GENE=="", NA, GENE))
long %>%
  mutate(POS = as.numeric(POS)) %>%
  ggplot(aes(x=POS, y=Sample, color=EFFECT)) +
  geom_point(alpha=0.7) +
  facet_wrap(~CHROM, scales="free_x", ncol=1) +
  theme_bw() +
   theme(legend.position='none') +
  labs(title="Variant positions per clone", x="Genomic position", y="Sample")

Cingolani, Pablo, Adrian Platts, Le Lily Wang, et al. 2012. A program for annotating and predicting the effects of single nucleotide polymorphisms, SnpEff: SNPs in the genome of Drosophila melanogaster strain w1118; iso-2; iso-3.” Fly 6 (2): 80–92. https://doi.org/10.4161/fly.19695.
Danecek, Petr, James K Bonfield, Jennifer Liddle, et al. 2021. Twelve years of SAMtools and BCFtools.” GigaScience 10 (2). https://doi.org/10.1093/gigascience/giab008.
Li, Heng. 2021. New strategies to improve minimap2 alignment accuracy.” Bioinformatics 37 (23): 4572–74. https://doi.org/10.1093/bioinformatics/btab705.
Zheng, Zhenxian, Shumin Li, Junhao Su, Amy Wing-Sze Leung, Tak-Wah Lam, and Ruibang Luo. 2022. Symphonizing pileup and full-alignment for deep learning-based long-read variant calling.” Nat. Comput. Sci. 2 (December): 797–803. https://doi.org/10.1038/s43588-022-00387-x.

  1. Bioinformatics Core Facility, Center for Synthetic Microbiology (SYNMIKRO), Philipps-Universität Marburg, Germany, ↩︎

  2. Bioinformatics Core Facility, Center for Synthetic Microbiology (SYNMIKRO), Philipps-Universität Marburg, Germany↩︎

  3. Faculty of Chemistry and Unit for Chemical Biology, Center for Synthetic Microbiology (SYNMIKRO), Philipps-Universität Marburg, Germany↩︎

  4. Faculty of Chemistry and Unit for Chemical Biology, Center for Synthetic Microbiology (SYNMIKRO), Philipps-Universität Marburg, Germany↩︎