bio: additional examples

The following script is a test file that we use to verify the various functionalities of bio. To run all tests execute:

bio test

The file lists the various usage modes of bio and is the most up-to-date demonstration of its usage.

Code on GitHub:

https://github.com/ialbert/bio/blob/master/biorun/data/usage.sh

The location for the data:

https://github.com/ialbert/bio/tree/master/biorun/data

Usage

#
# The Python tests are generated from this script.
#
# The output produced by each test can be viewed at:
#
# https://github.com/ialbert/bio/tree/master/test/data
#

# Abort on the first failing command (errexit), treat unset variables as
# errors (nounset) and print each command before it runs (xtrace).
set -o errexit -o nounset -o xtrace

# Select by gene id (--type gene --id N), with --end 10.
bio fasta genomes.gb --type gene --id N --end 10 > fasta_ids.fa

# Match with a regular expression (-m glyco); the input is read from stdin.
# NOTE(review): `-end` is written with a single dash here and in two commands
# below, while other commands use `--end`. Presumably both spellings are
# accepted -- confirm that this is intentional.
cat genomes.gb | bio fasta -m glyco -end 10 > fasta_match.fa

# All entries with --features and -end 10; the input is given as a file argument.
bio fasta genomes.gb -end 10 --features > fasta_all1.fa

# Same options with the input on stdin; should produce the same output as above.
cat genomes.gb | bio fasta --end  10 --features > fasta_all2.fa

# No difference between the outputs. diff exits non-zero when the files differ,
# which stops the script because it is set to stop on errors.
diff fasta_all1.fa fasta_all2.fa > nodiff.txt

# Generate JSON output from the GenBank file.
cat genomes.gb | bio json > genomes.json

# Same FASTA command as above, this time reading the JSON file just created.
cat genomes.json | bio fasta -end 10  --features > fasta_all3.fa

# No difference between the outputs obtained from the GenBank and JSON inputs.
diff fasta_all1.fa fasta_all3.fa > nodiff.txt

# Renaming with a pattern. The shell passes {isolate} through unchanged
# (a lone word in braces is not brace-expanded).
bio fasta genomes.gb --end 100  --rename {isolate} > fasta_rename1.fa

# Renaming with a file (alias.txt).
bio fasta genomes.gb --end 100  --rename alias.txt > fasta_alias1.fa

# Outputs CDS features overlapping 29514 (--olap) as FASTA.
cat genomes.gb | bio fasta --features --olap 29514 -e 10 --type CDS > fasta_olap1.fa

# Outputs features overlapping 29514 as GFF.
cat genomes.gb | bio gff --olap 29514 > gff_olap1.gff

# Compute sizes: table of the id, gene and size fields for CDS features
# selected with --olap 3778,8388,8987.
cat genomes.gb | bio table --type CDS --olap 3778,8388,8987 --fields id,gene,size  > table_1.txt

# Generate the default table.
cat genomes.gb | bio table > table_2.txt

# Generate CDS features only, with --end 10.
bio fasta genomes.gb --end 10 --type CDS > fasta_cds.fa

# Translate the CDS features.
bio fasta genomes.gb --type CDS --translate > fasta_translate.fa

# Translate a sequence given on the command line, in frame -3.
bio fasta GATTACA --frame -3 --translate > fasta_frame.fa

# Extract the proteins.
bio fasta genomes.gb --protein > fasta_protein.fa

# Start codons, taken from the CDS FASTA file created above.
# NOTE(review): this uses -e -3 (a negative end), whereas a start-codon slice
# would presumably be -e 3 -- confirm against the expected output.
cat fasta_cds.fa | bio fasta -e -3 > fasta_start.fa

# Last codons (-s -3), taken from the same CDS FASTA file.
cat fasta_cds.fa | bio fasta -s -3 > fasta_stop.fa

# Default alignment of two sequences given on the command line.
bio align GATTACA GATCA > align_default.txt

# Global alignment (--global).
bio align GATTACA GATCA --global > align_global.txt

# Local alignment (--local).
bio align GATTACA GATCA --local > align_local.txt

# Alignment with VCF output (--vcf).
bio align GATTACA GATCA --vcf > align_default.vcf

# Alignment with difference output (--diff).
bio align GATTACA GATCA --diff  > align_default.diff

# Running on a FASTA file, with VCF output.
bio align align_input.fa --vcf > align_input.vcf

# Write several sequences given on the command line as FASTA.
bio fasta GATTACA GTTAACA GTTTATA GTTT > fasta_multi.fa

# Creating the pileup output (--pile) for the same sequences.
bio align GATTACA GTTAACA GTTTATA GTTT --pile > align_pile1.txt

# Format to pairwise
bio format mafft.fa > format_mafft1.txt

# Format to VCF
bio format mafft.fa --vcf > format_mafft1.vcf

# Format to differences
bio format mafft.fa --diff > format_mafft1.diff.txt

# Select S proteins
bio fasta --gene S --protein  genomes.gb > fasta_s.fa

# Align the proteins selected above.
bio align fasta_s.fa > align_s.txt

# Alignment as a table.
bio align fasta_s.fa --table > align_s.tsv

# Alignment as variants (VCF).
bio align fasta_s.fa --vcf > align_s.vcf

# Convert GenBank files to GFF.
bio gff genomes.gb > gff_all.gff

# Convert GenBank files to GFF, restricted to --type CDS.
bio gff genomes.gb --type CDS > gff_CDS.gff

# Slice the GFF output with -s 300 -e 10k.
bio gff -s 300 -e 10k genomes.gb > gff_slice.gff

# Taxonomy listing for 117565 with -d 5.
bio taxon 117565 -d 5 > taxon1.txt

# TODO: taxonomy lineage from a file (command kept disabled).
# bio taxon genomes.gb --lineage > lineage.txt

# Getting some metadata for taxon 11138 (Murine hepatitis virus)
bio meta 11138 -H > meta.txt

# Define an exact term (exon).
bio explain exon > explain_exon.txt

# Define a multi-word term; the two words are passed as separate arguments.
bio explain food vacuole > explain_food.txt

# Search for terms
bio explain neutral > explain_neutral.txt

# Running comm.py on two files: default output, then with -1 and with -2.
bio comm file1.txt file2.txt > comm0.txt
bio comm -1 file1.txt file2.txt > comm1.txt
bio comm -2 file1.txt file2.txt > comm2.txt

# Running uniq.py on both files concatenated on stdin: default output,
# then with -f 2, then with -c -f 2.
cat file1.txt file2.txt | bio  uniq > uniq0.txt
cat file1.txt file2.txt | bio  uniq -f 2 > uniq1.txt
cat file1.txt file2.txt | bio  uniq -c -f 2  > uniq3.txt

# Get data from SRA (can be spotty)
bio search SRR1972976 > search_srr.json

# Get bioproject information
bio search PRJNA661333 > search_prjn.json

# Get assembly information
bio search GCF_000003085 > search_assembly.json

# Search mygene info
bio search symbol:HAD --species 1316788 > search_mygene.json

# Access a transcript from Ensembl.
bio fetch ENST00000288602  > fetch_enst.txt

# Get a GFF from NCBI
bio fetch NC_045512 --format gff > fetch_gff.gff

# Get a protein.
bio fetch YP_009724390 > fetch_prot.fa

# Get data from NCBI
# NOTE(review): this writes genomes.gb, the file that earlier commands in this
# script read as input. The redirection truncates the file before bio fetch
# runs, so a failed fetch would leave it empty -- confirm the ordering is intended.
bio fetch NC_045512 MN996532 > genomes.gb