#!/usr/bin/perl
## BacMet-Scan
$app_title = "BacMet-Scan - A toolbox for searching the Biocide and Metal Resistance Database";
$app_author = "Johan Bengtsson-Palme & Chandan Pal, University of Gothenburg";
$app_version = "1.0";
$app_message = "";
# ----------------------------------------------------------------- #
# License information
$license =
" BacMet-Scan - A toolbox for searching the Biocide and Metal Resistance Database
Copyright (C) 2013-2014 Johan Bengtsson-Palme & Chandan Pal
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of University of Gothenburg, nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
";
## BUGS:
$bugs = "Added feature:\
- Initial relase\
Known bugs in this version ($app_version):\
- None\
";
## OPTIONS:
$usage = "\
-i : the path to the input file containing non-paired sequences to scan\
-1 : if using paired-end input, the path to the input file containing the first reads to scan\
-2 : if using paired-end input, the path to the input file containing the second reads to scan\
-o : the base name of the BacMet-Scan output files (if not specified, BacMet-Scan will write to stdout)\
-d : the database to use, either EXP, PRE, or a path to a specific database directory, default 'PRE'\
Software options:\
=================\
-blast : uses BLAST for searching BacMet (default)\
-blastall : uses the old BLAST engine for searching BacMet\
-blat : uses BLAT for searching BacMet\
-pblat : uses Parallel BLAT (pblat) for searching BacMet\
-vmatch : uses VMATCH for searching BacMet\
-fixst : uses FIXST for searching BacMet\
-cpu : number of CPUs to use (if possible), default = 1\
-r : use this file (output from the tool above that generated the file)\
for input instead of performing the actual search\
-protein : input sequence file is in protein format (nucleotides are assumed by default)\
Filtering options:\
=================\
-e : E-value cutoff, default = 1\
-l : Length cutoff, default = 30\
-p : Percent identity cutoff, default = 90\
-s : Score per length cutoff, not used by default (tool dependent!)\
Output options:\
=================\
-table : outputs a BacMet-Scan report in table format (default)\
-report : outputs the BLAST/BLAT/VMATCH/FIXST report\
-counts : outputs a list of counts for each gene\
-matrix : outputs a list of counts for each gene, without the gene names, suitable for matrix format\
-toplist : outputs a list of encountered genes, sorted by abundance\
-all : outputs all possible BacMet-Scan output\
-columns : selects what columns to output to the BacMet-Scan output table\
comma-separated list with the following possible items:\
query,subject,gene,description,organism,location,compound,identity,length,evalue,score\
default is: query,subject,gene,identity,length\
can also be specified as 'all' to get all columns\
-v : be verbose (print messages during the BacMet-Scan process)\
-h : displays short usage information\
-help : displays this help message\
-bugs : displays the bug fixes and known bugs in this version\
-license : displays licensing information\
";
$bindir = $0;
$bindir =~ s/BacMet-Scan$//;
$db = "PRE";
$input = "";
$input1 = "";
$input2 = "";
$output = "";
$cpu = 1;
$report = "";
$software = "blast";
$protein = 0;
$E = 1;
$L = 30;
$P = 90;
$S = -1000000;
$out_table = 1;
$out_report = 0;
$out_counts = 0;
$out_matrix = 0;
$out_top = 0;
$columns = "query,subject,gene,identity,length";
$verbose = 0;
for ($i = 0; $i <= scalar(@ARGV); $i++) { # Goes through the list of arguments
$arg = @ARGV[$i]; # Stores the current argument in $arg
if (substr($arg, 0, 2) eq "--") {
$arg = substr($arg,1);
}
if ($arg eq "-i") { # Read input file from -i flag
$i++;
$input = @ARGV[$i];
}
if ($arg eq "-1") { # Read input files from -1 flag
$i++;
$input1 = @ARGV[$i];
}
if ($arg eq "-2") { # Read input files from -2 flag
$i++;
$input2 = @ARGV[$i];
}
if ($arg eq "-o") { # Read output file
$i++;
$output = @ARGV[$i];
}
if ($arg eq "-r") { # Read report file
$i++;
$report = @ARGV[$i];
}
if ($arg eq "-e") { # Read E-value cutoff
$i++;
$E = @ARGV[$i];
}
if ($arg eq "-l") { # Read minimal length
$i++;
$L = @ARGV[$i];
}
if ($arg eq "-p") { # Read identity cutoff
$i++;
$P = @ARGV[$i];
}
if ($arg eq "-s") { # Read score-per-length cutoff
$i++;
$S = @ARGV[$i];
}
if ($arg eq "-d") { # Read database
$i++;
$db = @ARGV[$i];
}
if ($arg eq "-cpu") { # Set number of CPUs
$i++;
$cpu = @ARGV[$i];
}
if ($arg eq "-blast") { # Set software to use
$software = "blast";
}
if ($arg eq "-blastall") { # Set software to use
$software = "blastall";
}
if ($arg eq "-blat") { # Set software to use
$software = "blat";
}
if ($arg eq "-pblat") { # Set software to use
$software = "pblat";
}
if ($arg eq "-vmatch") { # Set software to use
$software = "vmatch";
}
if ($arg eq "-fixst") { # Set software to use
$software = "fixst";
}
if ($arg eq "-protein") { # Set input sequence type
$protein = 1;
}
if ($arg eq "-all") { # Set output type
$out_report = 1;
$out_table = 1;
$out_matrix = 1;
$out_counts = 1;
$out_top = 1;
}
if ($arg eq "-report") { # Set output type
$out_report = 1;
}
if ($arg eq "-table") { # Set output type
$out_table = 1;
}
if ($arg eq "-counts") { # Set output type
$out_counts = 1;
}
if ($arg eq "-matrix") { # Set output type
$out_matrix = 1;
}
if ($arg eq "-toplist") { # Set output type
$out_top = 1;
}
if ($arg eq "-columns") { # Set output columns
$i++;
$columns = @ARGV[$i];
if (lc($columns) eq "all") {
$columns = "query,subject,gene,description,organism,location,compound,identity,length,evalue,score";
}
}
if ($arg eq "-v") { # Be verbose?
$verbose = 1;
}
## If "-h" is among the options, output short usage data and options
if ($arg eq "-h") {
print "Usage: BacMet-Scan -i -o \nOptions:$usage";
print "-----------------------------------------------------------------\n";
exit; # Exit
}
## If "-help" is among the options, output usage data and all options
if ($arg eq "-help") {
print "Usage: BacMet-Scan -i -o \nOptions:$usage";
print "-----------------------------------------------------------------\n";
exit; # Exit
}
## If "-bugs" is among the options, output bugs and features information
if ($arg eq "-bugs") {
print "$bugs\n";
exit; # Exit
}
## If "-license" is among the options, output license information
if ($arg eq "-license") {
print "$license\n";
exit; # Exit
}
}
if ($verbose == 1) {
## Print title message
print STDERR "$app_title\nby $app_author\nVersion: $app_version\n$app_message";
print STDERR "-----------------------------------------------------------------\n";
}
@columns = split(',',$columns);
## Check for input
if (($input eq "") && ($input1 eq "") && ($report eq "")) {
print STDERR "ERROR! No input file provided! Use '-h' for usage options!\n";
print STDERR "-----------------------------------------------------------------\n";
exit;
}
## If database is EXP or PRE auto-detect where it is
if ($db eq "PRE") {
$db_temp = `ls $bindir\BacMet/BacMet_PRE*fasta 2>/dev/null`;
if ($db_temp ne "") {
$db = "$bindir\BacMet/BacMet_PRE";
}
$db_temp = `ls $bindir\BacMet_PRE*fasta 2>/dev/null 2>/dev/null`;
if ($db_temp ne "") {
$db = "$bindir\BacMet_PRE";
}
$db_temp = `ls ./BacMet_PRE*fasta 2>/dev/null 2>/dev/null`;
if ($db_temp ne "") {
$db = "./BacMet_PRE";
}
} else {
if ($db eq "EXP") {
$db_temp = `ls $bindir\BacMet/BacMet_EXP*fasta 2>/dev/null`;
if ($db_temp ne "") {
$db = "$bindir\BacMet/BacMet_EXP";
}
$db_temp = `ls $bindir\BacMet_EXP*fasta 2>/dev/null`;
if ($db_temp ne "") {
$db = "$bindir\BacMet_EXP";
}
$db_temp = `ls ./BacMet_EXP*fasta 2>/dev/null`;
if ($db_temp ne "") {
$db = "./BacMet_EXP";
}
}
}
## Check for database
chomp($errormsg = `ls $db* 2>&1 1>/dev/null`); # Get the error msg when looking for the profile database
if (substr($errormsg,0,4) eq "ls: ") { # If the error message begins with "ls: ", then show an error message and exit
print STDERR "ERROR! The BacMet database could not be found.\
Expected to find it in $db\
Consult the manual for installation instructions.\n";
print STDERR "-----------------------------------------------------------------\n";
exit;
}
## Check for desired software
if ($software eq "blast") {
if ($protein == 0) {
chomp($path = `which blastx`); # Get the path for blastx
} else {
chomp($path = `which blastp`); # Get the path for blastp
}
if ($path eq "") { # If the path is empty, then show an error message and exit
print STDERR "ERROR! Could not locate BLAST binaries! Make sure that BLAST+ is installed, and try again.\
To use the old BLAST engine, use the -blastall option.\n";
print STDERR "-----------------------------------------------------------------\n";
exit;
}
}
if ($software eq "blastall") {
chomp($path = `which blastall`); # Get the path for blastall
if ($path eq "") { # If the path is empty, then show an error message and exit
print STDERR "ERROR! Could not locate BLAST binaries! Make sure that BLAST is installed, and try again.\
To use the new BLAST+ engine, use the -blast option.\n";
print STDERR "-----------------------------------------------------------------\n";
exit;
}
}
if ($software eq "blat") {
chomp($path = `which blat`); # Get the path for blat
if ($path eq "") { # If the path is empty, then show an error message and exit
print STDERR "ERROR! Could not locate BLAT binaries! Make sure that BLAT is installed, and try again.\n";
print STDERR "-----------------------------------------------------------------\n";
exit;
}
}
if ($software eq "pblat") {
chomp($path = `which pblat`); # Get the path for pblat
if ($path eq "") { # If the path is empty, then show an error message and exit
print STDERR "ERROR! Could not locate pBLAT binaries! Make sure that pBLAT is installed, and try again.\
To use the non-parallelized version of BLAT, use the -blat option.\n";
print STDERR "-----------------------------------------------------------------\n";
exit;
}
}
if ($software eq "vmatch") {
chomp($path = `which vmatch`); # Get the path for vmatch
if ($path eq "") { # If the path is empty, then show an error message and exit
print STDERR "ERROR! Could not locate VMATCH binaries! Make sure that VMATCH is installed, and try again.\n";
print STDERR "-----------------------------------------------------------------\n";
exit;
}
}
if ($software eq "fixst") {
chomp($path = `which fixst`); # Get the path for fixst
if ($path eq "") { # If the path is empty, then show an error message and exit
print STDERR "ERROR! Could not locate FIXST binaries! Make sure that FIXST is installed, and try again.\n";
print STDERR "-----------------------------------------------------------------\n";
exit;
}
}
if ($verbose == 1) {
$now = localtime;
print STDERR "$now : BacMet-Scan started...\n";
}
if ($input ne "") {
push(@inputFiles,$input);
}
if ($input1 ne "") {
push(@inputFiles,$input1);
}
if ($input2 ne "") {
push(@inputFiles,$input2);
}
if ($report eq "") { # If no report has been input, do the searching against BacMet
`rm $output.bacmet.report 2> /dev/null`;
if ($verbose == 1) {
$now = localtime;
print STDERR "$now : Preparing BacMet database for $software...\n";
}
if ($software eq "blast") {
`makeblastdb -in $db*fasta -title "BacMet-Scan database" -dbtype 'prot' -out $db `;
}
if ($software eq "blastall") {
`formatdb -i $db*fasta -t "BacMet-Scan database" -o F -p T -n $db`;
}
if ($software eq "blat") {
$db = "$db*fasta";
}
if ($software eq "pblat") {
$db = "$db*fasta";
}
if ($software eq "vmatch") {
`mkvtree -db $db*fasta -protein -indexname $db -pl -allout`;
}
if ($software eq "fixst") {
`fixst --index -i $db*fasta -d $db -f prot`;
}
foreach $input (@inputFiles) {
$runsoftware = 1;
if ($verbose == 1) {
$now = localtime;
print STDERR "$now : Searching $input against BacMet using $software...\n";
}
if ($software eq "blast") {
if ($protein == 0) {
`blastx -db $db -query $input -out $output.bacmet.report.temp -evalue $E -seg no -outfmt 6 -num_threads $cpu`;
} else {
`blastp -db $db -query $input -out $output.bacmet.report.temp -evalue $E -seg no -outfmt 6 -num_threads $cpu`;
}
}
if ($software eq "blastall") {
if ($protein == 0) {
`blastall -p blastx -d $db -i $input -o $output.bacmet.report.temp -e $E -F F -m 8 -a $cpu`;
} else {
`blastall -p blastp -d $db -i $input -o $output.bacmet.report.temp -e $E -F F -m 8 -a $cpu`;
}
}
if ($software eq "blat") {
if ($protein == 0) {
`blat -t=dnax -q=prot -out=blast8 $input $db.fasta $output.bacmet.report.temp`;
} else {
`blat -t=prot -q=prot -out=blast8 $input $db.fasta $output.bacmet.report.temp`;
}
}
if ($software eq "pblat") {
if ($protein == 0) {
`pblat -t=dnax -q=prot -out=blast8 -threads=$cpu $input $db.fasta $output.bacmet.report.temp`;
} else {
`pblat -t=prot -q=prot -out=blast8 -threads=$cpu $input $db.fasta $output.bacmet.report.temp`;
}
}
if ($software eq "vmatch") {
if ($protein == 0) {
`vmatch -dnavsprot 1 -l $L -h 2 -evalue $E -showdesc 80 -q $input $db > $output.bacmet.report.temp`;
} else {
`vmatch -l $L -h 2 -evalue $E -showdesc 80 -q $input $db > $output.bacmet.report.temp`;
}
}
if ($software eq "fixst") {
if ($protein == 0) {
`fixst -i $input -d $db -o $output.bacmet.report.temp -e $E -b -f nt`;
} else {
`fixst -i $input -d $db -o $output.bacmet.report.temp -e $E -b -f prot`;
}
}
`cat $output.bacmet.report.temp >> $output.bacmet.report`;
}
$report = "$output.bacmet.report";
}
if ($verbose == 1) {
$now = localtime;
print STDERR "$now : Analyzing $software report...\n";
}
open (REPORT, $report);
while (chomp($line = )) {
if ($software =~ m/blast/) {
($query,$subject,$identity,$length,$mismatches,$gaps,$qs,$qe,$ss,$se,$matchEval,$matchScore) = split('\t',$line);
}
if ($software =~ m/blat/) {
($subject,$query,$identity,$length,$mismatches,$gaps,$ss,$se,$qs,$qe,$matchEval,$matchScore) = split('\t',$line);
}
if ($software =~ m/vmatch/) {
if (substr($line,0,1) ne "#") {
$line =~ s/ */\t/g;
($null, $length, $subject, $matchSS, $DP, $matchQLength, $query, $qs, $mismatches, $matchEval, $matchScore, $identity) = split('\t', $line);
} else {
next;
}
}
if ($software =~ m/fixst/) {
if (substr($line,0,3) eq ">>>") {
# StartMarker | QueryID | MatchID | MappedTo | ReadingFrame | MatchLength | QueryStart | QueryEnd |
# | MatchStart | MatchEnd | P-value | E-value | Score | FullP-value | FullE-value | PrecentIdentity
($null, $query, $subject, $mapping, $rf, $length, $qs, $qe, $ss, $se, $nullPval, $nullEval, $matchScore, $matchPval, $matchEval, $identity) = split('\t', $line);
} else {
next;
}
}
# @subItems = split('\|', $subject);
# $subject = @subItems[3];
if ($length > 0) {
$score = $matchScore / $length;
} else {
$score = $matchScore;
}
if (($matchEval <= $E) && ($identity >= $P) && ($length >= $L) && ($score >= $S)) {
if (exists($hits{$query})) {
($savedSubject,$savedIdentity,$savedLength,$savedEval,$savedScore) = split('\t',$hits{$query});
if (($matchEval < $savedEval) && ($identity > $savedIdentity) && ($length >= $savedLength) && ($score >= $savedScore)) {
$hits{$query} = "$subject\t$identity\t$length\t$matchEval\t$score";
}
} else {
$hits{$query} = "$subject\t$identity\t$length\t$matchEval\t$score";
push(@order,$query);
}
}
}
close REPORT;
if ($verbose == 1) {
$now = localtime;
print STDERR "$now : Generating output...\n";
}
## Remove or move software report
if ($out_report == 1) {
if ($runsoftware == 1) {
`mv $report $output.report`;
}
} else {
if ($runsoftware == 1) {
`rm $report`;
}
}
## Read ID-to-Type mapping
chomp($mappingFile = `ls $db\*mapping.txt`);
open (MAPPING, $mappingFile);
while ($line = ) {
chomp($line);
## BacMet-Scan Version 2
($acc_no,$bacid,$gene,$location,$orgn,$compound,$desc) = split('\t',$line);
if ($bacid eq "") {
$bacid = $acc_no;
}
if ($acc_no eq "") {
$acc_no = $bacid;
}
$mapping{$bacid} = "$gene $location $orgn $compound $desc";
$mapping{$acc_no} = "$gene $location $orgn $compound $desc";
$descriptions{$acc_no} = $desc;
$organisms{$acc_no} = $orgn;
$genes{$acc_no} = $gene;
$locations{$acc_no} = $location;
$compounds{$acc_no} = $compound;
$descriptions{$bacid} = $desc;
$organisms{$bacid} = $orgn;
$genes{$bacid} = $gene;
$locations{$bacid} = $location;
$compounds{$bacid} = $compound;
$counts{$gene} = 0;
}
close MAPPING;
if ($out_table == 1) {
open (TABLE, ">$output.table");
if (scalar(grep(/query/, @columns)) > 0) {
print TABLE "Query\t";
}
if (scalar(grep(/subject/, @columns)) > 0) {
print TABLE "Subject\t";
}
if (scalar(grep(/gene/, @columns)) > 0) {
print TABLE "Gene\t";
}
if (scalar(grep(/description/, @columns)) > 0) {
print TABLE "Description\t";
}
if (scalar(grep(/organism/, @columns)) > 0) {
print TABLE "Organism\t";
}
if (scalar(grep(/location/, @columns)) > 0) {
print TABLE "Location\t";
}
if (scalar(grep(/compound/, @columns)) > 0) {
print TABLE "Compounds\t";
}
if (scalar(grep(/mapping/, @columns)) > 0) {
print TABLE "Complete mapping information\t";
}
if (scalar(grep(/identity/, @columns)) > 0) {
print TABLE "Percent identity\t";
}
if (scalar(grep(/length/, @columns)) > 0) {
print TABLE "Match length\t";
}
if (scalar(grep(/evalue/, @columns)) > 0) {
print TABLE "E-value\t";
}
if (scalar(grep(/score/, @columns)) > 0) {
print TABLE "Score per length\t";
}
print TABLE "\n";
}
foreach $query (@order) {
($savedSubject,$savedIdentity,$savedLength,$savedEval,$savedScore) = split('\t',$hits{$query});
$savedMapping = $mapping{$savedSubject};
$savedGene = $genes{$savedSubject};
$savedDesc = $descriptions{$savedSubject};
$savedOrgn = $organisms{$savedSubject};
$savedLocation = $locations{$savedSubject};
$savedCompound = $compounds{$savedSubject};
if ($savedMapping eq "") {
@accessions = split('\|',$savedSubject);
foreach $seqID (@accessions) {
$savedMapping = $mapping{$seqID};
if ($savedMapping ne "") {
$savedGene = $genes{$seqID};
$savedDesc = $descriptions{$seqID};
$savedOrgn = $organisms{$seqID};
$savedLocation = $locations{$seqID};
$savedCompound = $compounds{$seqID};
last;
}
}
}
if ($savedMapping eq "") {
$savedMapping = $savedSubject;
$savedGene = $savedSubject;
$savedDesc = "Not found in BacMet gene index";
$savedOrgn = "N/A";
$savedLocation = "N/A";
$savedCompound = "N/A";
}
$counts{$savedGene}++;
if ($out_table == 1) {
#query,subject,gene,description,organism,mapping,identity,length,evalue,score
if (scalar(grep(/query/, @columns)) > 0) {
print TABLE $query . "\t";
}
if (scalar(grep(/subject/, @columns)) > 0) {
print TABLE $savedSubject . "\t";
}
if (scalar(grep(/gene/, @columns)) > 0) {
print TABLE $savedGene . "\t";
}
if (scalar(grep(/description/, @columns)) > 0) {
print TABLE $savedDesc . "\t";
}
if (scalar(grep(/organism/, @columns)) > 0) {
print TABLE $savedOrgn . "\t";
}
if (scalar(grep(/location/, @columns)) > 0) {
print TABLE $savedLocation . "\t";
}
if (scalar(grep(/compound/, @columns)) > 0) {
print TABLE $savedCompound . "\t";
}
if (scalar(grep(/mapping/, @columns)) > 0) {
print TABLE $savedMapping . "\t";
}
if (scalar(grep(/identity/, @columns)) > 0) {
print TABLE $savedIdentity . "\t";
}
if (scalar(grep(/length/, @columns)) > 0) {
print TABLE $savedLength . "\t";
}
if (scalar(grep(/evalue/, @columns)) > 0) {
print TABLE $savedEval . "\t";
}
if (scalar(grep(/score/, @columns)) > 0) {
print TABLE $savedScore . "\t";
}
print TABLE "\n";
}
}
if ($out_table == 1) {
close TABLE;
}
if (($out_matrix == 1) || ($out_counts == 1)) {
if ($out_matrix == 1) {
open (MATRIX, ">$output.matrix");
}
if ($out_counts == 1) {
open (COUNTS, ">$output.counts");
}
foreach $gene (sort(keys(%counts))) {
if ($out_counts == 1) {
print COUNTS $gene . "\t" . $counts{$gene} . "\n";
}
if ($out_matrix == 1) {
print MATRIX $counts{$gene} . "\n";
}
}
if ($out_matrix == 1) {
close MATRIX;
}
if ($out_counts == 1) {
close COUNTS;
}
}
if ($out_top == 1) {
open (TOPLIST, ">$output.toplist");
foreach $gene (sort {$counts{$b} <=> $counts{$a}} keys(%counts)) {
if ($counts{$gene} > 0) {
print TOPLIST $gene . "\t" . $counts{$gene} . "\n";
}
}
close TOPLIST;
}
if ($verbose == 1) {
$now = localtime;
print STDERR "$now : Finished!\n";
}