bio: additional examples
The following script is a test file that we use to verify the various functionalities of bio. To run all tests, execute:
bio test
The file lists the various usage modes of bio
and is the most up-to-date demonstration of its usage.
Code on GitHub:
The location for the data:
Usage
#
# This script is used to generate Python tests.
#
# The output generated by each test can be seen at:
#
# https://github.com/ialbert/bio/tree/master/test/data
#
# Stop on errors.
#   -u  expanding an unset variable is an error
#   -e  exit as soon as a command returns a non-zero status
#   -x  print each command before it is executed
# NOTE(review): pipefail is not enabled, so for the `cat ... | bio ...`
# pipelines below only the exit status of the last command is checked.
set -uex
# FASTA extraction from GenBank, from a stream and from JSON.
#
# The same --end 10 --features request is made three ways (file argument,
# GenBank on stdin, JSON on stdin) and the results are compared with diff.
# diff exits non-zero when the files differ, which stops the script because
# of the `set -e` above; nodiff.txt is therefore expected to stay empty.
#
# The long option is spelled --end throughout: a single-dash "-end" is not
# the same option string and may be parsed as the short -e with the attached
# value "nd".

# Selecting by gene id
bio fasta genomes.gb --type gene --id N --end 10 > fasta_ids.fa
# Match with regular expression
cat genomes.gb | bio fasta -m glyco --end 10 > fasta_match.fa
# Features of all entries, sliced with --end 10 (GenBank file as argument).
bio fasta genomes.gb --end 10 --features > fasta_all1.fa
# Should produce the same output as above (GenBank read from stdin).
cat genomes.gb | bio fasta --end 10 --features > fasta_all2.fa
# No difference between outputs.
diff fasta_all1.fa fasta_all2.fa > nodiff.txt
# Generate a JSON output
cat genomes.gb | bio json > genomes.json
# Features of all entries, sliced with --end 10 (JSON read from stdin).
cat genomes.json | bio fasta --end 10 --features > fasta_all3.fa
# No difference between outputs.
diff fasta_all1.fa fasta_all3.fa > nodiff.txt
# Renaming, overlap selection, tables, translation and codon slicing.
# All commands read genomes.gb (or a file produced earlier in this section)
# and write one output file each.

# Renaming with patterns
# ({isolate} contains no comma or range, so the shell does not brace-expand
# it and passes it to bio literally.)
bio fasta genomes.gb --end 100 --rename {isolate} > fasta_rename1.fa
# Renaming with a file
bio fasta genomes.gb --end 100 --rename alias.txt > fasta_alias1.fa
# Outputs overlapping features in FASTA
cat genomes.gb | bio fasta --features --olap 29514 -e 10 --type CDS > fasta_olap1.fa
# Outputs overlapping features as GFF
cat genomes.gb | bio gff --olap 29514 > gff_olap1.gff
# Table with the id, gene and size fields for CDS features selected
# with --olap 3778,8388,8987.
cat genomes.gb | bio table --type CDS --olap 3778,8388,8987 --fields id,gene,size > table_1.txt
# Generate default table.
cat genomes.gb | bio table > table_2.txt
# CDS features only, sliced with --end 10.
# fasta_cds.fa is reused as input by the two codon examples below.
bio fasta genomes.gb --end 10 --type CDS > fasta_cds.fa
# Translate the features.
bio fasta genomes.gb --type CDS --translate > fasta_translate.fa
# Translate in a frame (the sequence is given directly on the command line).
bio fasta GATTACA --frame -3 --translate > fasta_frame.fa
# Extract the proteins.
bio fasta genomes.gb --protein > fasta_protein.fa
# Start codons
# NOTE(review): this passes -e -3 (negative end) while the stop-codon example
# passes -s -3; confirm that -e -3 really selects the first codon.
cat fasta_cds.fa | bio fasta -e -3 > fasta_start.fa
# Last codons
cat fasta_cds.fa | bio fasta -s -3 > fasta_stop.fa
# Alignment and alignment-formatting examples.
# Sequences are given either directly on the command line or as FASTA files.

# Default alignment.
bio align GATTACA GATCA > align_default.txt
# Alignment with the --global flag.
bio align GATTACA GATCA --global > align_global.txt
# Alignment with the --local flag.
bio align GATTACA GATCA --local > align_local.txt
# Alignment reported with --vcf.
bio align GATTACA GATCA --vcf > align_default.vcf
# Alignment reported with --diff.
bio align GATTACA GATCA --diff > align_default.diff
# Running on FASTA files.
bio align align_input.fa --vcf > align_input.vcf
# Several sequences given on the command line, written out by bio fasta.
bio fasta GATTACA GTTAACA GTTTATA GTTT > fasta_multi.fa
# Creating the pileup output for the same sequences.
bio align GATTACA GTTAACA GTTTATA GTTT --pile > align_pile1.txt
# Format to pairwise
bio format mafft.fa > format_mafft1.txt
# Format to VCF
bio format mafft.fa --vcf > format_mafft1.vcf
# Format to differences
bio format mafft.fa --diff > format_mafft1.diff.txt
# Select S proteins
# fasta_s.fa is the input of the three alignments that follow.
bio fasta --gene S --protein genomes.gb > fasta_s.fa
# Align proteins.
bio align fasta_s.fa > align_s.txt
# Alignment as a table.
bio align fasta_s.fa --table > align_s.tsv
# Align as variants.
bio align fasta_s.fa --vcf > align_s.vcf
# GFF conversion, taxonomy, metadata, term explanation and the
# comm / uniq helpers.

# Convert genbank files to GFF
bio gff genomes.gb > gff_all.gff
# Convert genbank files to GFF, restricted with --type CDS
bio gff genomes.gb --type CDS > gff_CDS.gff
# Slice the GFF file.
bio gff -s 300 -e 10k genomes.gb > gff_slice.gff
# Taxonomy listing.
bio taxon 117565 -d 5 > taxon1.txt
# TODO: taxonomy lineage from a file (the command below is disabled).
# bio taxon genomes.gb --lineage > lineage.txt
# Getting some metadata for taxon 11138 (Murine hepatitis virus)
bio meta 11138 -H > meta.txt
# Define exact SO term
bio explain exon > explain_exon.txt
# Define a two-word term; the words are unquoted, so they reach bio as two
# separate arguments.
bio explain food vacuole > explain_food.txt
# Search for terms
bio explain neutral > explain_neutral.txt
# Running comm.py: without flags, then with -1 and with -2.
bio comm file1.txt file2.txt > comm0.txt
bio comm -1 file1.txt file2.txt > comm1.txt
bio comm -2 file1.txt file2.txt > comm2.txt
# Running uniq.py on the concatenation of both files: without flags,
# with -f 2, and with -c -f 2.
# NOTE(review): the output names go uniq0, uniq1, uniq3 (there is no
# uniq2.txt); presumably this matches the stored test data - confirm.
cat file1.txt file2.txt | bio uniq > uniq0.txt
cat file1.txt file2.txt | bio uniq -f 2 > uniq1.txt
cat file1.txt file2.txt | bio uniq -c -f 2 > uniq3.txt
# Search and fetch examples; each command writes one output file.

# Get data from SRA (can be spotty)
bio search SRR1972976 > search_srr.json
# Get bioproject information
bio search PRJNA661333 > search_prjn.json
# Get assembly information
bio search GCF_000003085 > search_assembly.json
# Search mygene info
bio search symbol:HAD --species 1316788 > search_mygene.json
# Access a transcript from ensembl.
bio fetch ENST00000288602 > fetch_enst.txt
# Get a GFF from NCBI
bio fetch NC_045512 --format gff > fetch_gff.gff
# Get a protein.
bio fetch YP_009724390 > fetch_prot.fa
# Get data from NCBI
# NOTE(review): genomes.gb is read by the commands earlier in this script but
# is only (re)written here, at the very end; presumably a copy already exists
# in the working directory when the script starts - confirm.
bio fetch NC_045512 MN996532 > genomes.gb