Appendix 1 CRM Database Application Code
SQL Files
schema.sql
--
-- Table structure for table `element`
--
DROP TABLE IF EXISTS `element`;
CREATE TABLE `element` (
`element_id` int(11) NOT NULL default '0',
`gene_id` int(11) default NULL,
`element_type_id` int(11) default NULL,
`name` text,
`position_start` int(11) default NULL,
`position_end` int(11) default NULL,
`sequence` text,
`regulator_id` int(11) default NULL,
PRIMARY KEY (`element_id`),
KEY `gene_id` (`gene_id`),
KEY `element_type_id` (`element_type_id`),
KEY `regulator_id` (`regulator_id`),
CONSTRAINT `element_geneid_fk`
FOREIGN KEY (`gene_id`)
REFERENCES `gene` (`gene_id`),
CONSTRAINT `element_regulatorid_fk`
FOREIGN KEY (`regulator_id`)
REFERENCES `gene` (`gene_id`),
CONSTRAINT `element_typeid_fk`
FOREIGN KEY (`element_type_id`)
REFERENCES `element_type` (`element_type_id`)
);
--
-- Table structure for table `element_type`
--
DROP TABLE IF EXISTS `element_type`;
CREATE TABLE `element_type` (
`element_type_id` int(11) NOT NULL default '0',
`name` text,
PRIMARY KEY (`element_type_id`)
);
--
-- Table structure for table `gene`
--
DROP TABLE IF EXISTS `gene`;
CREATE TABLE `gene` (
`gene_id` int(11) NOT NULL default '0',
`name` text,
`sequence` longtext,
`abbreviation` varchar(12) default NULL,
`motifs` text,
`img` varchar(255) default NULL,
`img_small` varchar(255) default NULL,
PRIMARY KEY (`gene_id`)
);
--
-- Table structure for table `motif`
--
DROP TABLE IF EXISTS `motif`;
CREATE TABLE `motif` (
`motif_id` int(11) NOT NULL default '0',
`name` text,
`abbreviation` varchar(12) default NULL,
`motif` text,
PRIMARY KEY (`motif_id`)
);
--
-- Table structure for table `regulator`
--
DROP TABLE IF EXISTS `regulator`;
CREATE TABLE `regulator` (
`element_id` int(11) NOT NULL default '0',
`gene_id` int(11) NOT NULL default '0',
PRIMARY KEY (`element_id`,`gene_id`),
CONSTRAINT `element_geneid_fk`
FOREIGN KEY (`gene_id`)
REFERENCES `gene` (`gene_id`),
CONSTRAINT `element_fk`
FOREIGN KEY (`element_id`)
REFERENCES `element` (`element_id`)
);
Perl Files
element.cgi
#!/usr/local/bin/perl
# --
# -- HTML form for adding new elements (enhancers, promoters, exons)
# -- to a sequences
# --
use DBI;
use CGI;
use strict;
use lib "/home/zburke/bg/workspace/perl-lib";
use nazina;
use nazina_db;
my $dbh = nazina_db::dbh();
main();
sub main
{
my $cgi = CGI->new();
print $cgi->header();
if ($ENV{'REQUEST_METHOD'} eq "GET") {
do_get($cgi);
} else {
do_post($cgi);
}
}
sub do_get
{
my ($cgi) = @_;
my $dao = nazina_db->new($dbh);
my $gene_id = $cgi->param('gene_id');
my $genes = $dao->genes();
my $select .= "";
print <<"EOT";
EOT
}
sub do_post
{
my ($cgi) = @_;
my $idsth = $dbh->prepare("select max(element_id) as max_id from element");
$idsth->execute();
my $element_id;
while (my ($id) = $idsth->fetchrow_array) {
$element_id = $id;
}
$element_id++;
print STDERR "id is $element_id\n";
my $gene_id = $cgi->param('gene_id');
my $p_start = $cgi->param('p_start');
my $p_end = $cgi->param('p_end');
my $type_id = $cgi->param('type_id');
my $name = $cgi->param('name');
my $sth = $dbh->prepare("insert into element values (?, ?, ?, ?, ?, ?, '')");
$sth->execute($element_id, $gene_id, $type_id, $name, $p_start, $p_end);
print STDERR "sth->execute($element_id, $gene_id, $type_id, $name, $p_start, $p_end);\n";
do_get($cgi);
}
gene.cgi
#!/usr/local/bin/perl
# --
# -- form for gene details (name, abbreviation, motifs, sequence)
# --
use DBI;
use CGI;
use strict;
use lib "/home/zburke/bg/workspace/perl-lib";
use nazina;
use nazina_db;
my $dao = nazina_db->new(); ;
main();
sub main
{
my $cgi = CGI->new();
my $gene_id = $cgi->param('gene_id');
if ($ENV{'REQUEST_METHOD'} eq "GET" && $gene_id) {
print $cgi->header();
do_get($gene_id);
} else {
do_post($cgi);
}
}
# --
# --
# -- do_get
# -- show gene editing form
sub do_get
{
my ($gene_id) = @_;
my $gene = $dao->gene($gene_id);
print <<"EOT";
EOT
}
# --
# --
# -- process form submission and return to index
sub do_post
{
my ($cgi) = @_;
my $gene_id = $cgi->param('gene_id');
my $name = $cgi->param('name');
my $abbv = $cgi->param('abbv');
my $motifs = $cgi->param('motifs');
my $seq = $cgi->param('sequence');
my $sth = $dao->dbh()->prepare("update gene set name = ?, abbreviation = ?, motifs = ? where gene_id = ?");
$sth->execute($name, $abbv, $motifs, $gene_id);
# only set sequence if a new one was submitted
if ($seq) {
my $sth = $dao->dbh()->prepare("update gene set sequence = ? where gene_id = ?");
$sth->execute($seq, $gene_id);
}
print "Location: index.cgi\n\n";
}
index.cgi
#!/usr/local/bin/perl
# --
# -- show all genes with sequence data,
# HTML form for putting new elements (enhancers, promoters, exons)
#
use DBI;
use CGI;
use strict;
use lib "/home/zburke/bg/workspace/perl-lib";
use nazina;
use nazina_db;
my $dao = nazina_db->new();
main();
sub main
{
my ($cgi, $show, $gene_id, $content, $htmx);
$cgi = CGI->new();
$gene_id = $cgi->param('gene_id');
$show = $cgi->param('show');
# show motifs for a gene
if ($show eq "motifs" && $gene_id) {
$content = motif_show($gene_id);
}
# show gene profile
elsif ($show eq "gene" && $gene_id) {
$content = gene_show($gene_id);
}
# show genes grouped by element regulators
elsif ($show eq "regulation") {
$content = regulator_show();
}
# DEFAULT: show profile list for genes with sequence data
else {
$content = gene_show_all();
}
$htmx = nazina::htmx();
$htmx =~ s/~content~/$content/;
print $cgi->header();
print $htmx;
}
# --
# --
# -- gene_show_all
# -- show small picture and name for all genes with sequence data
# --
sub gene_show_all
{
my $content = "";
my $genes = $dao->genes();
$content .= "
\n";
return $content;
}
# --
# --
# -- motif_show
# -- return motifs for a given gene
sub motif_show
{
my ($gene_id) = @_;
return "
".$dao->motifs($gene_id)."
";
# my $sth = $dbh->prepare("select motifs from gene where gene_id = ?");
# $sth->execute($gene_id);
# my ($motifs) = $sth->fetchrow_array();
# return "
$motifs
";
}
# --
# --
# -- regulator_links
# -- given a list of regulatory elements, return links to display
# -- their motifs in a single string.
sub regulator_links
{
my ($regulators) = @_;
my @links = ();
for ( sort { $a->{'abbv'} cmp $b->{'abbv'} } @{ $regulators }) {
push @links, "$_->{'abbv'}";
}
return (join ", \n", @links);
}
# --
# -- regulator_show
# -- show genes grouped by regulator elemetns
# --
sub regulator_show
{
my $content = "";
my $genes = $dao->genes();
$content .= "
\n";
my $sth = $dao->dbh()->prepare("
select distinct r.gene_id, g.name
from regulator r, gene g
where r.gene_id = g.gene_id
order by g.name");
$sth->execute();
my $esth = $dao->dbh()->prepare("
select e.gene_id, e.name e_name, g.name g_name, g.abbreviation
from element e, gene g, regulator r
where
e.gene_id = g.gene_id
and e.element_id = r.element_id
and r.gene_id = ?
order by g.name");
# loop over regulator elements
while (my ($r_id) = $sth->fetchrow_array()) {
my $r = ${ $genes }{$r_id};
$content .= "
";
# fetch elements regulated by this
$esth->execute($r_id);
while (my ($gene_id, $e_name, $g_name, $abbv) = $esth->fetchrow_array()) {
$content .= "$g_name ($abbv); $e_name \n";
}
$content .= "
\n";
}
$content .= "
";
return $content;
}
motifs.cgi
#!/usr/local/bin/perl
# --
# -- show motifs (TFBS?) for a given regulatory element
# --
use DBI;
use CGI;
use strict;
use lib "/home/zburke/bg/workspace/perl-lib";
use nazina;
use nazina_db;
my $dbh = nazina_db::dbh();
main();
sub main
{
my ($cgi, $gene_id, $content, $htmx);
$cgi = CGI->new();
$gene_id = $cgi->param('gene_id');
open HTMX, "template.htmx";
while () { $htmx .= $_; };
close HTMX;
if ($gene_id) {
$content = gene_show($gene_id);
} else {
$content = gene_show_all();
}
$htmx =~ s/~content~/$content/;
print $cgi->header();
print $htmx;
}
# --
# --
# -- gene_show_all
# -- show small picture and name for all genes with sequence data
# --
sub gene_show_all
{
my $content = "";
my $genes = nazina::genes($dbh);
$content .= "";
$content .= "
\n";
}
close INDEX;
}
}
compare2.pl
#!/usr/local/bin/perl
# --
# -- 2005-07-19, zak.burke@dartmouth.edu
# -- given directories with motif-density profiles for different motif counts,
# -- generate HTML files that include the same gene at all different counts
# -- in order to see how different counts compare.
# --
# -- this should helps us determine how many of the top-N motifs from SCOPE we
# -- should use when trying to rank the pCRM sequences that were fed to SCOPE.
# --
use strict;
use Getopt::Long;
use constant TRUE => 1;
use constant FALSE => 0;
main();
sub main
{
my $params = {
"genes" => [],
"numbers" => [],
"rounds" => [],
"help" => FALSE,
};
GetOptions(
$params,
'genes=s@',
'numbers=s@',
'rounds=s@',
'help!',
);
$params->{'genes'} = [split ",", (join ",", @{ $params->{'genes'} } )];
$params->{'numbers'} = [split ",", (join ",", @{ $params->{'numbers'} } )];
$params->{'rounds'} = [split ",", (join ",", @{ $params->{'rounds'} } )];
for my $gene (@{ $params->{'genes'} }) {
mkdir $gene unless -d $gene;
my $dir_name = "round1/$gene.$params->{'numbers'}->[0]";
opendir DIR, $dir_name or die("couldn't open $dir_name: $!");
my @regulators = grep /png$/, readdir DIR;
closedir DIR;
open INDEX, ">$gene/index.html";
for my $reg (sort @regulators) {
$reg =~ /(.*)\.motif_profile\.png/;
my $name = $1;
open FILE, ">$gene/compare.$name.html";
print FILE "
\n";
for my $number (@{ $params->{'numbers'} }) {
print FILE "
\n";
}
close INDEX;
}
}
motif_rank.pl
#!/usr/bin/perl -w
# --
# -- 2005-07-12, zak.burke@dartmouth.edu
# --
# -- given a list of sequences and a list of motifs, draw a heat map of
# -- motif density for each sequence.
# --
# -- graphing routines based on overlap.pl
# --
# --
BEGIN {
# output FASTA filename
use constant OUTPUT_FASTA_FILE => "update.fa";
# output phi-score filename
use constant OUTPUT_PHI_SCORE_FILE => "../phi_score.txt";
$ENV{'BG_PERL_LIB'} = "/home/zburke/bg/workspace/perl-lib" unless $ENV{'BG_PERL_LIB'};
$ENV{'HOME_PERL_LIB'} = "." unless $ENV{'HOME_PERL_LIB'};
$time_beg = time();
}
END {
print STDERR util::elapsed_time(time() - $time_beg);
}
# --
# -- nothing to configure below
# --
use strict;
use Getopt::Long;
use Data::Dumper;
use lib $ENV{'BG_PERL_LIB'};
use scan_parse;
use nazina_db;
use extract;
use gene;
use motif_rank;
use motif_rank_plot;
use util;
# height of a true CRM
use constant PLOT_HEIGHT => 0.4;
# unbuffer stdout
$|++;
main();
sub main
{
my $time_beg = time();
# parse command line options, then show help if necessary
my $params = init();
usage() if $params->{'help'};
usage() unless ($params->{'fasta'} && $params->{'motifs'});
# read input files
my $pcrms = motif_rank::fasta_read($params);
my $motifs = motif_rank::motif_read($params);
my %summary = ();
my $block_count = 0;
# parse motifs in distinct genes
for my $abbv (sort keys %{ $pcrms }) {
print STDERR "locating motifs in $abbv...\n";
# pCRM blocks for this gene
my $blocks = ${ $pcrms }{$abbv};
$block_count += scalar @{ $blocks };
$summary{$abbv} = sequence_motif_parse($abbv, $blocks, $motifs, $params);
}
# rank motifs based on SCOPE's phi_score and coverage of pCRM blocks in
# the input sequence
print STDERR "re-ranking SCOPE's motifs...\n";
$motifs = motifs_rank($motifs, $block_count, scalar keys %{ $pcrms }, $params);
motifs_write($motifs);
# splice out extra motifs
plottable_motifs($motifs, $params);
# plot pCRMs for each distinct gene
for my $abbv (sort keys %{ $pcrms }) {
print STDERR "plotting $abbv...\n";
# pCRM blocks for this gene
my $blocks = ${ $pcrms }{$abbv};
$summary{$abbv} = sequence_plot($summary{$abbv}, $motifs, $params);
}
motif_rank::phi_write(\%summary);
# plot frequency distributions across all genes, e.g. if there are
# 5 sequences with 10 pCRMs each, rank pCRMs from 1-50 instead of
# 1-10 within each sequence.
motif_rank::summary_plot(\%summary) if $params->{'summary'};
}
# --
# --
# -- motifs_write
# -- write re-ranked motifs to motifs.reranked.txt
sub motifs_write
{
my ($motifs) = @_;
my $filename = "motifs.rerank.txt";
open RERANK, ">", $filename or die("couldn't create '$filename': $!");
printf RERANK "\%20s \%8s \%8s \%8s \%8s\n", "motif", "sig", "p-cover", "g-cover", "pipeline";
for (@{ $motifs }) {
printf RERANK "\%20s \%6f \%6f \%6f \%6f\n",
$_->{'motif'}, $_->{'sig_score'}, $_->pcrm_block_coverage_score, $_->pcrm_gene_coverage_score, $_->pipeline_score;
}
close RERANK;
}
# --
# --
# -- motifs_rank
# -- rank motifs by SCOPE's sig_score and their coverage score
sub motifs_rank
{
my ($motifs, $block_count, $gene_count, $params) = @_;
# set motif scores ...
for my $motif (@{ $motifs }) {
# how well does this motif cover pCRMs in the set?
$motif->pcrm_block_coverage_score($motif->pcrm_block_count() / $block_count);
# how well does this motif cover genes in the set?
$motif->pcrm_gene_coverage_score($motif->pcrm_gene_count() / $gene_count);
# combine sig and gene-coverage scores
$motif->pipeline_score($params->{'score_m_sig'}, $params->{'score_m_pcrm'});
}
# filter out motifs without full gene coverage
my @list = grep { $_->pcrm_gene_coverage_score() == 1 } @{ $motifs };
# ... and sort by combined (pipeline) score, high (best) to low (worst)
#@list = sort { $b->pipeline_score() <=> $a->pipeline_score() } @list;
return \@list;
}
# --
# -- sequence_motif_parse
# -- find motif positions in sequences
sub sequence_motif_parse
{
my ($abbv, $blocks, $motifs, $params) = @_ ;
my $seq = {};
$seq->{'abbv'} = $abbv;
$seq->{'blocks'} = $blocks;
# initialize motif positions for this gene
for my $motif (@{ $motifs }) { $motif->{'blocks'} = []; }
# CRM blocks for this gene
$seq->{'gene'} = motif_rank::gene($abbv);
$seq->{'crm_blocks'} = $seq->{'gene'}->enhancers_by_regulator($params->{'regulator'});
# count motif density in each block
my $count_distribution = motif_rank::motif_count($seq->{'blocks'}, $motifs, $seq->{'gene'});
for my $motif (@{ $motifs }) {
$seq->{'motif_blocks'}{$motif->{'motif'}} = $motif->{'blocks'};
}
return $seq;
}
# --
# --
# -- sequence_plot
# -- plot pCRMs and motif positions in a sequence
sub sequence_plot
{
my ($seq, $motifs, $params) = @_;
print STDERR "parsing $seq->{'abbv'}...\n";
my $count_distribution = plot_motif_count($seq->{'blocks'}, $motifs, $params);
my $plot_fields = {};
$plot_fields->{'abbv'} = $seq->{'abbv'};
$plot_fields->{'fields'} = [0..($seq->{'gene'}->length() - 1)];
# prepare plots for CRMs, promoters and exons
$plot_fields->{'crms'} = motif_rank::profile($seq->{'gene'}->enhancers(), PLOT_HEIGHT, $seq->{'gene'}->length());
$plot_fields->{'promoters'} = motif_rank::profile($seq->{'gene'}->promoters(), PLOT_HEIGHT, $seq->{'gene'}->length());
$plot_fields->{'exons'} = motif_rank::profile($seq->{'gene'}->exons(), PLOT_HEIGHT, $seq->{'gene'}->length());
# for each block, get score-per-sequence-position from blocks
$plot_fields->{'pcrms'} = motif_rank::blocks2bins($seq->{'blocks'}, $count_distribution, $seq->{'gene'});
# motif positions
$plot_fields->{'y_min'} = 0; # this is reset by reference on the next line
$plot_fields->{'motifs'} = motif_rank::motif_blocks2bins($motifs, $seq->{'gene'}, \$plot_fields->{'y_min'});
$plot_fields->{'y_max'} = PLOT_HEIGHT;
# density values
$plot_fields->{'density_values'} = $count_distribution;
# plot it
print STDERR "\tplotting\n";
motif_rank_plot::plot($plot_fields) if $params->{'plot'};
# new FASTA file keeping only $params->{keep}% of the pCRMs
print STDERR "\twriting new FASTA\n";
motif_rank::fasta_write($seq->{'blocks'}, $seq->{'gene'}, $params) if $params->{'keep'} > 0;
my $scores = motif_rank::phi_score($seq->{'blocks'}, $seq->{'gene'});
my $block_scores = motif_rank::phi_score_block($seq->{'blocks'}, $seq->{'gene'});
# collect block density, profile information for summary plots
return {
'blocks' => $seq->{'blocks'},
'plot_fields' => $plot_fields,
'phi_score' => $scores->{'phi'}, # position-based phi-score
'tp_score' => $scores->{'tp'}, # position-based true positives
'fp_score' => $scores->{'fp'}, # position-based false positives
'tp_score_block' => $block_scores->{'tp'}, # block-based crm-pcrm overlaps
'fp_score_block' => $block_scores->{'fp'}, # block-based crm-pcrm misses
};
}
# --
# --
# -- plot_motif_count
# -- new version of motif_count only counts $params->{'top'} motifs
sub plot_motif_count
{
my ($blocks, $motifs, $params) = @_;
my @counts;
for my $block (@{ $blocks }) {
$block->{'motif_count'} = 0;
for my $motif (@{ $motifs }) {
if ($block->{'motifs'}{$motif->{'motif'}}) {
$block->{'motif_count'} += $block->{'motifs'}{$motif->{'motif'}};
}
}
my $max = length $block->{'sequence'};
# calculate density score
$block->{'density'} = $block->{'motif_count'} / $max;
print "$block->{'abbv'} $block->{'beg'} - $block->{'end'}\n";
print "\ttotal motif-count: $block->{'motif_count'} (density score: $block->{'density'})\n";
print "\tdistinct motifs: ". scalar (keys %{ $block->{'motifs'} }) . " of " . (scalar @{ $motifs }). "\n";
push @counts, $block->{'density'};
}
return \@counts;
}
# --
# --
# -- plottable_motifs
# -- remove low-scoring motifs from a ranked list.
sub plottable_motifs
{
my ($motifs, $params) = @_;
# return top N motifs only, if N > 0
if ($params->{'top'} > 0) {
splice @{ $motifs }, $params->{'top'};
}
return $motifs;
}
# --
# --
# -- init
# -- parse command line options and truncate the output files.
sub init
{
my $params = {
"fasta" => "", # file containing pCRM sequences
"motifs" => "", # file containing motifs found in $params->{fasta}
"top" => -1, # how many motifs from $params->{motifs} to use
"keep" => -1, # percent of pCRM block to write to output FASTA file
"plot" => util::TRUE, # TRUE to generate pCRM plots, ranked by density within each sequence
"summary" => util::TRUE, # TRUE to generate pCRM plots, ranked by density with the set
"raw" => util::FALSE, # TRUE to dump the plot data structure to a file for fast printing later
"block_rm_0" => util::TRUE, # TRUE to always remove all pCRM blocks with 0-density scores
"block_rm_1" => util::TRUE, # TRUE to always remove at least one pCRM block
"score_m_sig" => 1, # motif sig-score multiplier
"score_m_pcrm" => 1, # motif pcrm-coverage-score multiplier
"regulator" => "", # abbv of gene regulating this set
"help" => util::FALSE,# TRUE to show help and quit
};
GetOptions(
$params,
'fasta=s',
'motifs=s',
'top=i',
'keep=i',
'plot!',
'summary!',
'raw!',
'block_rm_0!',
'block_rm_1!',
'score_m_sig=f',
'score_m_pcrm=f',
'regulator=s',
'help!',
);
# truncate OUTPUT_FASTA_FILE
if ($params->{'keep'} > 0) {
open FILE, '>', OUTPUT_FASTA_FILE;
close FILE;
}
return $params;
}
sub raw_write
{
my ($abbv, $plot_fields) = @_;
open FILE, ">$abbv.raw";
print FILE Dumper([$plot_fields]);
close FILE;
}
# --
# --
# -- usage
# -- show how to use program, then exit
sub usage
{
print < --motifs [--top 1-9]
description
calculate motif density in CRM blocks; plot as a heat map
ouput
graphs displaying motifs density per block as a heat map
also spits out .raw files with Data::Dumper for quick plotting
later on; see motif_rank_raw.pl.
arguments
fasta: file containing FASTA sequence blocks
motifs: file containing motifs, one per line
optional arguments; default booleans in ALL CAPS
top: number of motifs to use; default is all
keep: % of pCRM blocks to write to FASTA file; default -1 (none)
[no]plot: generate pCRM plots, ranked by density within each sequence
[no]summary: generate pCRM plots, ranked by density within the set
[NO]raw: dump the plot data structure to a file for fast printing later
[no]block_rm_0: remove all pCRM blocks with 0-density scores
[no]block_rm_1: remove at least one pCRM block
EOT
exit 0;
}
motif_rank_raw.pl
#!/usr/local/bin/perl
# --
# -- 2005-07-19, zak.burke@dartmouth.edu
# -- plots .raw files found in directory; defaults to current directory.
# -- see motif_rank.pl for raw file data structure details.
# --
use strict;
use Getopt::Long;
use GD::Graph;
use GD::Graph::mixed;
use Statistics::Descriptive;
use Data::Dumper;
use lib "/home/zburke/bg/workspace/perl-lib";
use scan_parse;
use nazina_db;
use extract;
use gene;
use motif_rank_plot;
use constant TRUE => 1;
use constant FALSE => 0;
use constant FASTA_LINE_LENGTH => 60;
use constant MOTIF_COUNT_BINS => 11;
main();
sub main
{
my $params = {
"dir" => ".",
"help" => FALSE,
};
GetOptions(
$params,
'dir=s',
'help!',
);
usage() if $params->{'help'};
# find *.raw files in requested directory ...
opendir DIR, $params->{'dir'} or die("couldn't open dir $params->{'dir'}: $!");
my @list = grep /raw$/, readdir DIR;
closedir DIR;
# ... and plot them
for (@list) {
motif_rank_plot::plot(raw_read("$params->{'dir'}/$_"));
}
}
# --
# --
# -- raw_read
# -- read a file written by Data::Dumper back into a datastructure and
# -- return it.
sub raw_read
{
my ($filepath) = @_;
print "reading $filepath...\n";
open FILE, "<$filepath" or die("couldn't open $filepath: $!");
my @lines = ;
close FILE;
my $VAR1;
eval (join "", @lines);
return $VAR1->[0];
}
# --
# --
# -- usage
# -- describe how this program works, then quit.
sub usage
{
print <
description
plots .raw files found in directory; defaults to current directory.
see motif_rank.pl for raw file data structure details.
ouput
graphs displaying motifs density per block as a heat map.
arguments
directory: where to look for raw files; defaults to current directory.
EOT
exit 0;
}
phi_score_reader.pl
#!/usr/bin/perl -w
# --
# -- 2005-08-24, zak.burke@dartmouth.edu
# -- given a formatted phi-score file, pull out and print the
# -- score for the requested gene/round combination. prints -1
# -- if the requested tuple is not found.
# --
# -- fileformat:
#
#round 1
# btd 0.214855286473715
# eve 0.688316606703276
# hb 0.197399783315276
# kni 0.105945545471384
# kr 0.417505924170616
# otd 0.260181268882175
#round 2
# btd 0.1875
# eve 0.60302049622438
# hb 0.209136822773186
# kni 0.10976779888697
# kr 0.42992222052768
# otd 0.267719472767968
# ...
# round n
# ...
# --
# --
use strict;
my ($filename, $round, $abbv);
($filename, $round, $abbv) = @ARGV;
my (%rounds, $i, $gene, $junk);
open FILE, "<", $filename or die ("couldn't open $filename: $!");
while () {
chomp;
if (/^round/) {
($junk, $i) = split /\s+/, $_;
next;
}
# trim leading and trailing whitespace
s/^\s+//g; s/\s+$//g;
my ($gene, $phi) = split /\s+/, $_;
if ($gene eq $abbv && $round eq "round_$i") {
print "$phi";
exit;
}
}
print "-1";
pipeline.pl
#!/usr/bin/perl -w
# --
# -- 2005-08-14, zak.burke@dartmouth.edu
# -- iterate through SCOPE and motif_rank to winnow pCRMs
# --
# -- $Author: zburke $
# -- $Revision: 1.4 $
# -- $Date: 2005/10/21 03:13:38 $
# --
BEGIN {
$ENV{'BG_PERL_LIB'} = "/home/zburke/bg/workspace/perl-lib" unless $ENV{'BG_PERL_LIB'};
$ENV{'BG_PERL_BIN'} = "/home/zburke/bg/workspace/pipeline" unless $ENV{'BG_PERL_BIN'};
}
# point to script handling sequence parsing, motif-reranking and pCRM ranking
use constant MOTIF_RANK => $ENV{'BG_PERL_BIN'} ."/motif_rank.pl";
# how many times to run SCOPE?
use constant ITERATION_COUNT => 5;
# default: number of motifs to use in ranking pCRM blocks
use constant MOTIF_COUNT => 4;
# default: % of pCRM blocks to keep after each round
use constant PCRM_THRESHOLD => 75;
# --
# -- nothing to configure below
# --
use strict;
use Getopt::Long;
use lib $ENV{'BG_PERL_LIB'};
use scan_parse;
use scope;
use util;
my $params = {
"dir" => "", # directory containing seed FASTA files from LWF
"top" => MOTIF_COUNT, # number of motifs to use in ranking pCRM blocks
"keep" => PCRM_THRESHOLD, # % of pCRM blocks to keep
"plot" => util::TRUE, # whether to draw plots
"score_m_sig" => 1, # motif sig-score multiplier
"score_m_pcrm" => 1, # motif pcrm-coverage-score multiplier
"regulator" => "", # abbv of gene regulating a given set
# (derived from seed sequence name)
"help" => util::FALSE, # whether to show usage details
};
main();
sub main
{
GetOptions(
$params, 'dir=s', 'top=i', 'keep=i',
'score_m_sig=f', 'score_m_pcrm=f',
'plot!',
'help!',
);
$params->{'dir'} = "$ENV{'PWD'}/$params->{'dir'}" unless $params->{'dir'} =~ /^\//;
# show help and exit, if requested
usage() if $params->{'help'};
# extract seed *.fa files from $params->{dir}, then iterate on them
my $seeds = seed_read($params->{'dir'});
for my $seed (@{ $seeds }) {
$seed =~ /^([^\.]+)/;
# abbreviation of gene regulating this set
$params->{'regulator'} = $1;
do_rounds($seed);
}
}
# --
# --
# -- seed_read
# -- get *.fa files in given dir; dies on error.
sub seed_read
{
my ($dir) = @_;
opendir DIR, $dir or die("couldn't open $dir: $!");
my @list = grep /\.fa$/, readdir DIR;
closedir DIR;
return \@list;
}
# --
# --
# -- do_rounds
# -- (SCOPE <=> motif_rank) loop for a given sequence
sub do_rounds
{
my ($seed) = @_;
mkdir $seed;
chdir $seed;
mkdir "round_0";
`cp $params->{'dir'}/$seed round_0/sequence.fasta`;
my $scope = scope->new();
for my $i (1..ITERATION_COUNT) {
`echo "round \t$i" >> phi_score.txt`;
my $previous = "round_" . ($i-1);
my $dir = "round_$i";
mkdir $dir;
chdir $dir;
my $cwd = "$ENV{'PWD'}/$seed";
my $fasta = "$cwd/$previous/sequence.fasta";
# path to SCOPE's output file
$scope->dir_out("$cwd/$dir/testResults/unknown_SCOPE.txt");
$scope->run($fasta);
motifs_write($scope);
my $motif_rank = MOTIF_RANK . " " . (join " ", @{ motif_rank_flags($fasta, $params) });
`$motif_rank > density_profile.txt`;
`mv update.fa sequence.fasta`;
chdir "..";
}
chdir "..";
}
# --
# --
# -- motifs_write
# -- write a list of words to the output file ./motifs.txt
sub motifs_write
{
my ($scope) = @_;
my $filename = "motifs.txt";
open FILE, ">", $filename or die("couldn't create '$filename': $!");
for (@{ $scope->results() }) {
print FILE scan_parse::line_write($_, scope::motif_line_format());
}
close FILE;
}
# --
# --
# -- motif_rank_flags
# -- run-time params for motif_rank.pl
sub motif_rank_flags
{
my ($fasta, $params) = @_;
my @flags;
push @flags,
"--fasta=$fasta",
"--motifs=motifs.txt",
"--top=$params->{'top'}",
"--keep=$params->{'keep'}",
"--nosummary",
"--noplot",
"--score_m_sig=$params->{'score_m_sig'}",
"--score_m_pcrm=$params->{'score_m_pcrm'}",
"--regulator=$params->{'regulator'}",
;
return \@flags;
}
# --
# --
# -- usage
# -- show how to call, then quit
sub usage
{
print <
arguments
dir: directory containing seed FASTA files from LWF
optional arguments
top: number of motifs to use in ranking pCRM blocks [4]
keep: % of pCRM blocks to keep [75]
[no]plot: whether to draw plots
score_m_sig: motif sig-score multiplier [1]
score_m_pcrm: motif pcrm-coverage-score multiplier [1]
[NO]help: show this help
EOT
exit 0;
}
pipeline_compare.pl
#!/usr/bin/perl -w
# --
# -- parse pipeline output files into a summary HTML page.
# --
# -- within $param->{dir}, uses phi_score.txt and other dirs which contain
# -- *.png and motifs.rerank.txt files which are parsed to generate
# -- round-based and summary statistics. all output is written to STDOUT
# -- in HTML.
# --
# -- $Author: zburke $
# -- $Revision: 1.1 $
# -- $Date: 2005/10/23 00:47:47 $
# --
BEGIN {
$ENV{'BG_PERL_LIB'} = "/home/zburke/bg/workspace/perl-lib" unless $ENV{'BG_PERL_LIB'};
$ENV{'BG_PERL_BIN'} = "/home/zburke/bg/workspace/pipeline" unless $ENV{'BG_PERL_BIN'};
}
use strict;
use Getopt::Long;
use GD::Graph::lines;
use lib $ENV{'BG_PERL_LIB'};
use pipeline_util;
main();
sub main
{
# name of directory to read
my $params = {
"dir" => "",
"motif_count" => 7,
"title" => $ENV{'PWD'},
};
GetOptions($params, 'dir=s', 'motif_count=i', 'title=s');
# read all non-hidden files in the dir
opendir DIR, $params->{'dir'} or die("couldn't open $params->{'dir'}");
my @pairs = grep /90/, readdir DIR;
# my @pairs = grep !/^\./, readdir DIR;
closedir DIR;
my $summary = {};
my @regs;
for my $pair_dir (@pairs) {
next unless -d $pair_dir;
$pair_dir =~ /([^-]+)-to-([^-]+)__*/;
my $sig_multiplier = $1;
my $coverage_multiplier = $2;
opendir DIR, "$params->{'dir'}/$pair_dir" or die("couldn't open $params->{'dir'}/$pair_dir");
my @genes = grep /\.fa$/, readdir DIR;
closedir DIR;
for my $gene_dir (@genes) {
push @regs, $gene_dir;
print "$params->{'dir'}/$pair_dir/$gene_dir\n";
my $run_summary = pipeline_util::phi_score_reader_summary("$params->{'dir'}/$pair_dir/$gene_dir/phi_score.txt");
$summary->{$pair_dir}->{$gene_dir} = summarize($run_summary, "$params->{'dir'}/$pair_dir/$gene_dir");
print "" . ("=" x 60) . "\n";
}
}
for my $gene (@regs) {
my $plot = summary_plot_prepare($summary, $gene);
pipeline_util::summary_graph_bars($summary, $plot, "$gene.png", "rho score");
}
}
# --
# --
# -- summarize
# -- summarize results for a run
sub summarize
{
my ($summary, $dirname) = @_;
my ($phi_list, $rho_list, $rho2_list) = (undef, undef, undef);
my $values = {};
for my $round (sort keys %{ $summary }) {
my ($rho_sum, $rho_avg, $rho2_sum, $rho2_avg, $phi_sum, $phi_avg) = (0, 0, 0, 0);
$rho_sum += $_ for map { "$summary->{$round}->{$_}->{rho}" } sort keys %{ $summary->{$round} };
$rho2_sum += $_ for map { "$summary->{$round}->{$_}->{rho2}" } sort keys %{ $summary->{$round} };
$phi_sum += $_ for map { "$summary->{$round}->{$_}->{phi}" } sort keys %{ $summary->{$round} };
$rho_avg = $rho_sum / (scalar keys %{ $summary->{$round} });
push @{ $rho_list }, $rho_avg;
$rho2_avg = $rho2_sum / (scalar keys %{ $summary->{$round} });
push @{ $rho2_list }, $rho2_avg * 1000;
$phi_avg = $phi_sum / (scalar keys %{ $summary->{$round} });
push @{ $phi_list }, $phi_avg;
print "$round\tphi: $phi_avg\trho: $rho_avg\trho2: $rho2_avg\n";
$values->{$round} = $rho_avg;
}
my $params = {
"rho" => $rho_list,
"rho2" => $rho2_list,
"phi" => $phi_list,
};
my $plot = run_plot_prepare($params);
pipeline_util::summary_graph($params, $plot, "$dirname/trends_summary.png");
return $rho_list;
}
# --
# --
# -- run_plot_prepare
# -- helper for summary_graph. mungs data into GD::graph compatible format
# --
sub run_plot_prepare
{
my ($params) = @_;
my (@data, @types, @legend);
for (keys %{ $params }) {
push @data, $params->{$_};
push @types, "lines";
push @legend, $_;
}
# labels for rounds; must be first @data element
my @rounds;
for (my $i = 1; $i <= scalar @{$data[0]}; $i++) {
push @rounds, $i;
}
unshift @data, \@rounds;
return {
"legend" => \@legend,
"data" => \@data,
"types" => \@types,
};
}
# --
# --
# -- summary_plot_prepare
# -- $params->{$pair}->{$gene}->{$round} = score
sub summary_plot_prepare
{
my ($params, $gene) = @_;
my (@data, @types, @legend, @rounds);
for (sort keys %{ $params }) {
if ($params->{$_}->{$gene}) {
for (my $i = 0; $i < (scalar @{ $params->{$_}->{$gene} }); $i++) {
$data[$i] = [] unless $data[$i];
push @{ $data[$i] }, ${ $params->{$_}->{$gene} }[$i];
}
push @types, "bars";
push @rounds, $_;
}
}
# labels for rounds; must be first @data element
unshift @data, \@rounds;
return {
"data" => \@data,
"types" => \@types,
};
}
pipeline_pictures.pl
#!/usr/bin/perl -w
# --
# -- parse pipeline output files into a summary HTML page.
# --
# -- within $param->{dir}, uses phi_score.txt and other dirs which contain
# -- *.png and motifs.rerank.txt files which are parsed to generate
# -- round-based and summary statistics. all output is written to STDOUT
# -- in HTML.
# --
# -- $Author: zburke $
# -- $Revision: 1.5 $
# -- $Date: 2005/10/23 00:47:47 $
# --
BEGIN {
$ENV{'BG_PERL_LIB'} = "/home/zburke/bg/workspace/perl-lib" unless $ENV{'BG_PERL_LIB'};
$ENV{'BG_PERL_BIN'} = "/home/zburke/bg/workspace/pipeline" unless $ENV{'BG_PERL_BIN'};
}
use strict;
use Getopt::Long;
use lib $ENV{'BG_PERL_LIB'};
use pipeline_util;
main();
sub main
{
# name of directory to read
my $params = {
"dir" => "",
"motif_count" => 7,
"title" => $ENV{'PWD'},
};
GetOptions($params, 'dir=s', 'motif_count=i', 'title=s');
# read all non-hidden files in the dir
opendir DIR, $params->{'dir'} or die("couldn't open $params->{'dir'}");
my @rounds = grep !/^\./, readdir DIR;
closedir DIR;
# print a table with a row for each round; each row contains a cell
# for each gene
print "$params->{'title'}
\n";
for my $round (@rounds) {
print round($round, $params);
}
print "
\n";
gene_print($params);
my $plot = pipeline_util::summary_graph_data($params);
pipeline_util::summary_graph($params, $plot, "trends.png");
}
# --
# --
# -- round
# -- gather information about each gene in the given round into a
# -- and return it
sub round
{
my ($round, $params) = @_;
my $content = "";
# skip non-directory items, i.e. everything but round_0..round_n
return unless -d "$params->{'dir'}/$round";
# holder for calculating avg phi-score for this round
my @phi_total;
my @scores;
$content .= "
\n";
# find all image files in this round
opendir DIR, "$params->{'dir'}/$round";
my @files = grep /.*motif_profile.png$/, readdir DIR;
closedir DIR;
for my $file (@files) {
my @words = split /\./, $file;
my $abbv = $words[0];
# phi-score for this round-gene pair
my $score = pipeline_util::phi_score_reader("$params->{'dir'}/phi_score.txt", $round, $abbv);
push @phi_total, $score->{'phi'};
push @scores, $score;
my $filename = "$params->{'dir'}/$round/motifs.rerank.txt";
my $motifs = join ", ", @{ pipeline_util::motifs_reader($filename, $params) };
$content .= <
\n";
return $content;
}
# --
# --
# -- phi_summary
# -- return phi score summary in a TD, and store it in $$phi_avg as a side-effect
sub score_summary
{
my ($scores, $phi_avg) = @_;
my $content = "";
my ($tp_total, $fp_total) = (0, 0);
$$phi_avg = pipeline_util::score_average($scores, "phi");
$tp_total += $_ for map { $_->{'tp'} } @{ $scores };
$fp_total += $_ for map { $_->{'fp'} } @{ $scores };
my $ratio = 0.0;
$ratio = ($tp_total / $fp_total) if ($fp_total > 0);
my $rho_ratio = 0.0;
$rho_ratio = ($tp_total / ($tp_total + $fp_total)) if ($fp_total > 0);
$content .= "
\n";
$content .= "
avg. phi score: $$phi_avg
\n";
$content .= "
tp/fp: $tp_total / $fp_total ($ratio)
\n";
$content .= "
rho: $rho_ratio
\n";
$content .= "
\n";
return $content;
}
# --
# --
# -- gene_print
# -- print rounds for each specific gene into a file named $gene.html
sub gene_print
{
my ($params) = @_;
for (sort keys %{ $params->{'genes'} }) {
my $details = $params->{'genes'}->{$_};
open GENE, ">", "$_.html";
print GENE "$params->{'title'}";
print GENE "
\n";
for my $round_key (sort keys %{ $details } ) {
my $round = $details->{$round_key};
print GENE <
phi: $round->{'phi'} $round->{'motifs'}
EOT
}
print GENE "
\n";
close GENE;
}
}
regulators.pl
#!/usr/bin/perl
# --
# -- for each regulatory element, find all the elements it regulates
# -- and write them to a FASTA file named for the regulator.
# --
# -- $Author: zburke $
# -- $Revision: 1.5 $
# -- $Date: 2005/10/23 00:47:47 $
# --
BEGIN {
$ENV{'BG_PERL_LIB'} = "/home/zburke/bg/workspace/perl-lib" unless $ENV{'BG_PERL_LIB'};
$ENV{'BG_PERL_BIN'} = "/home/zburke/bg/workspace/pipeline" unless $ENV{'BG_PERL_BIN'};
}
use strict;
use Getopt::Long;
use lib $ENV{'BG_PERL_LIB'};
use nazina_db;
use util;
main();
sub main
{
my $dao = nazina_db->new();
my $list = $dao->regulatory_elements();
for my $id (sort {$list->{$a}->{'name'} cmp $list->{$b}->{'name'} } keys %{ $list }) {
my $r = $list->{$id};
my $filename = "$r->{'abbv'}.fa";
open FILE, ">$filename" or die("couldn't create $filename: $!");
print "$r->{'name'}\n";
for my $element (sort { $a->{gene}->{abbv} cmp $b->{gene}->{abbv} } @{ $r->{'elements'} }) {
print "\t$element->{gene}->{name} ($element->{name})\n";
print FILE util::fasta(
$element->{'sequence'},
"$element->{'gene'}->{'name'} $element->{'position_start'}-$element->{'position_end'}"
);
}
}
}
summary.pl
#!/usr/local/bin/perl
# --
# -- 2005-08-07, zak.burke@dartmouth.edu
# -- given directories with summary motif density plots (i.e. motif density
# -- is colored based on rank within all pCRMs in the set, not just within
# -- the sequence), create an HTML page that shows all these summaries.
# --
# --
use strict;
use Getopt::Long;
use constant TRUE => 1;
use constant FALSE => 0;
main();
sub main
{
my $params = {
"genes" => [],
"numbers" => [],
"help" => FALSE,
};
GetOptions(
$params,
'genes=s@',
'numbers=s@',
'help!',
);
$params->{'genes'} = [split ",", (join ",", @{ $params->{'genes'} } )];
$params->{'numbers'} = [split ",", (join ",", @{ $params->{'numbers'} } )];
for my $gene (@{ $params->{'genes'} }) {
my $dir_name = "$gene.$params->{'numbers'}->[1]";
opendir DIR, $dir_name or die("couldn't open $dir_name: $!");
my @regulators = grep /summary\.motif_profile\.png$/, readdir DIR;
closedir DIR;
my %summary;
for my $number (@{ $params->{'numbers'} }) {
open FILE, ">$gene.$number/summary.html";
for my $reg (sort @regulators) {
print FILE " \n";
push @{ $summary{$reg} }, "";
}
close FILE;
}
open FILE, ">$gene.summary.html";
print FILE "
\n";
for my $row (sort keys %summary) {
print FILE "
\n\t
". (join "
\n\t
", @{ $summary{$row} } ) . "
\n
\n";
}
print FILE "
";
close FILE;
}
}
Appendix 4 Library Code
Perl Files
extract.pm
package extract;
# --
# -- 2005-06-28, zak.burke@dartmouth.edu
# -- based on /lwf/analysis/hm_parse_best.pl, extract certain tuples
# -- from the output file created by hm_parse.pl.
# --
BEGIN
{
$ENV{'BG_PERL_LIB'} = "/home/zburke/bg/workspace/perl-lib" unless $ENV{'BG_PERL_LIB'};
}
use strict;
use lib $ENV{'BG_PERL_LIB'};
use scan_parse;
use constant TRUE => 1;
use constant FALSE => 0;
# --
# --
# -- extract
# -- parse an output file from hm.pl
sub extract
{
my ($file, $conditions) = @_;
my @lines = ();
# parser for fields embedded in filename
my $field_parser = scan_parse::field_parser('gene');
# parse lines into records; grab first line to determine format
open FILE, $file or die("couldn't open $file: $!");
my $header = ;
my $line_parser = scan_parse::parser($header);
while () {
chomp;
# skip header lines
next unless /^[0-9]/;
my $line = scan_parse::field_parse($_, $line_parser, $field_parser);
# keep lines satisfying a condition-set
push @lines, $line if condition_matches($conditions, $line);
}
close FILE;
return \@lines;
}
# --
# --
# -- condition_matches
# -- returns TRUE if the given line matches at least one of the
# -- given condition-sets
sub condition_matches
{
# conditions: array-ref of condition-sets
# line: list of values to match against a condition-set
my ($conditions, $line) = @_;
# assume there is no match;
my $match = 0;
for my $condition (@{ $conditions }) {
# how many conditions are there in this condition-set?
my $match_required = keys %{ $condition };
# how many conditions matched so far?
my $match_count = 0;
for my $field (keys %{ $condition }) {
$match_count++ if ($condition->{$field} == $line->{$field});
}
# matched all parameters in a condition-set;
return TRUE if ($match_count == $match_required);
}
# line failed to match a condition-set
return FALSE;
}
return 1;
gene.pm
package gene;
# --
# --
# --
# -- $Author: zburke $
# -- $Revision: 1.4 $
# -- $Date: 2005/10/20 05:19:04 $
# --
sub new
{
my $class = shift;
# capture gene_id, name, abbv, sequence
my %input = @_;
my $self = \%input;
bless $self, $class;
return $self;
}
# --
# --
# -- return ratio of cumulative CRM length to total sequence length
sub s2n
{
my $self = shift;
my $seq_len = length($self->{'sequence'});
# can't be any CRMs if we haven't any sequence
return 0 unless $seq_len;
if (! $self->{'enhancers'}) {
$self->{'enhancers'} = $self->enhancers();
}
my $crm_len = 0;
for (@{ $self->{'enhancers'} }) {
$crm_len += ($_->{'end'} - $_->{'beg'}) + 1;
}
return ($crm_len / $seq_len);
}
# --
# --
# -- enhancers
# -- return list of enhancer elements for this gene
sub enhancers
{
my $self = shift;
return $self->{'enhancers'} if $self->{'enhancers'};
$self->{'enhancers'} = $self->{'dao'}->gene_enhancers($self->{'gene_id'});
return $self->{'enhancers'};
}
# --
# --
# -- enhancers_by_regulator
# -- return the enhancer elements governed by the given regulator
sub enhancers_by_regulator
{
my $self = shift;
my ($regulator) = @_;
my @list;
for (@{ $self->enhancers() }) {
for my $reg (@{ $_->{'regulators'} }) {
if ($reg->{'abbv'} eq $regulator) {
push @list, $_;
}
}
}
return \@list;
}
# --
# --
# -- promoters
# -- return list of promoter elements for this gene
sub promoters
{
my $self = shift;
return $self->{'promoters'} if $self->{'promoters'};
$self->{'promoters'} = $self->{'dao'}->gene_promoters($self->{'gene_id'});
return $self->{'promoters'};
}
# --
# --
# -- exons
# -- return list of exon elements for this gene
sub exons
{
my $self = shift;
return $self->{'exons'} if $self->{'exons'};
$self->{'exons'} = $self->{'dao'}->gene_exons($self->{'gene_id'});
return $self->{'exons'};
}
# --
# --
# -- length
# -- length of the sequence
sub length
{
my $self = shift;
if (! $self->{'length'}) {
$self->{'length'} = length $self->{'sequence'};
}
return $self->{'length'}
}
1;
lwf_plot.pm
package lwf_plot;
# --
# -- 2005-12-14, zak.burke@dartmouth.edu
# -- helper routines for LWF's CRM-pCRM plot scripts,
# -- e.g. lwf/analysis/plot_crm2.pl, overlap.pl
# --
BEGIN {
$ENV{'BG_PERL_LIB'} = "/home/zburke/bg/workspace/perl-lib" unless $ENV{'BG_PERL_LIB'};
$ENV{'HOME_PERL_LIB'} = "." unless $ENV{'HOME_PERL_LIB'};
}
use strict;
use lib $ENV{'BG_PERL_LIB'};
use gene;
use util;
# --
# --
# -- stats_calculate
# -- given true-positive, false-positive, true-negative and false-negative
# -- counts, return a string indicating the pCRM coverage of the entire
# -- sequence and the pCRM coverage of true CRMs.
sub stats_calculate
{
my ($stats) = @_;
my $size = $stats->{'tp'} + $stats->{'fp'} + $stats->{'tn'} + $stats->{'fn'};
my $total_coverage = ($stats->{'tp'} + $stats->{'fp'}) / $size;
my $crm_coverage = $stats->{'tp'} / ($stats->{'tp'} + $stats->{'fn'});
my $phi_score = util::phi_score($stats->{'tp'}, $stats->{'fp'}, $stats->{'fn'});
my $hm = util::harmonic_mean(
util::recall($stats->{'tp'}, $stats->{'fn'}),
util::precision($stats->{'tp'}, $stats->{'fp'}));
return sprintf("t-cover %0.3f; crm-cover: %0.3f; hm: %0.3f; phi: %0.3f", $total_coverage, $crm_coverage, $hm, $phi_score);
}
# --
# --
# -- threshold_normalize
# -- normalize an array such that 0 <= values <= 1
sub threshold_normalize
{
my ($list, $min, $max) = @_;
my (@threshold, $adjust);
# how much to bump up scores to get them all in the range 0 - $max
$adjust = 0 - $min;
$max += $adjust;
for (@{ $list }) {
push @threshold, (($_ + $adjust) / $max);
}
return \@threshold;
}
# --
# --
# -- pip
# -- extract percent-identity-profile for D. Melangaster and D. Pseudoobscura
# --
# -- return
# -- array-ref of percent identity for each position from 1-16000
# -- params
# -- $abbv - abbreviation of gene to extract profile for
# --
sub pip
{
my ($abbv, $pip_file) = @_;
open FILE, $pip_file;
my $line = ;
chomp $line;
my @genes = split /\t/, $line;
my @list = ();
my $pos = 0;
while () {
chomp;
my @scores = split /\t/;
for (my $i = 0; $i < scalar @scores; $i++) {
push @{ $list[$i] }, ($scores[$i] / 100);
}
last if $pos++ > 16000;
}
close FILE;
for (my $i = 0; $i < scalar @genes; $i++) {
next unless $genes[$i] eq $abbv;
return $list[$i];
}
}
1;
motif.pm
package motif;
# --
# -- motif data bit bucket
# --
# --
# --
# --
# -- $Author: zburke $
# -- $Revision: 1.4 $
# -- $Date: 2005/10/23 01:10:06 $
# --
use strict;
sub new
{
my $class = shift;
my $self = {};
bless($self, $class);
my %input = @_;
# letters in the motif
$self->{'motif'} = $input{'motif'} || "";
# blocks where this motif occurs anywhere in a sequence, i.e. not
# necessarily pCRM blocks. in this case, a block is just an item with
# begin and end positions indicating the subsequence this motif matches.
$self->{'blocks'} = $input{'blocks'} || [];
# count of pCRM blocks where this motif occurs
$self->{'pcrm_block_count'} = $input{'pcrm_block_count'} || 0;
# SCOPE's sig score for this motif
$self->{'sig_score'} = $input{'sig_score'} || 0;
# derived properties
if ($self->{'motif'}) {
# number of letters in $self->{'motif'}
$self->{'length'} = length $self->{'motif'};
# regular expression matching $self->{'letters'}
$self->re();
}
return $self;
}
# -- static regular expression building blocks for expanding motifs
# -- defined in the IUPAC alphabet
my %iupac = (
'a' => "[a]",
'c' => "[c]",
'g' => "[g]",
't' => "[t]",
'm' => "[ac]",
'r' => "[ag]",
'w' => "[at]",
's' => "[cg]",
'y' => "[ct]",
'k' => "[gt]",
'v' => "[acg]",
'h' => "[act]",
'd' => "[agt]",
'b' => "[cgt]",
'n' => "[acgt]",
);
# --
# --
# -- re
# -- generate a regular expression matching the expanded iupac alphabet
# -- used to represent the motif
sub re
{
my $self = shift;
if (! $self->{'re'}) {
my @letters = ();
my $re = "";
@letters = split "", $self->{'motif'};
for (@letters) {
$re .= $iupac{$_};
}
$self->{'re'} = $re;
}
return $self->{'re'};
}
# --
# --
# -- pipeline_score
# --
sub pipeline_score
{
my $self = shift;
my ($sig_multiplier, $coverage_multiplier) = @_;
# force recalculation if we have new multipliers
if ($sig_multiplier || $coverage_multiplier) {
$self->{'sig_multiplier'} = $sig_multiplier || $self->{'sig_multiplier'} || 1;
$self->{'coverage_multiplier'} = $coverage_multiplier || $self->{'coverage_multiplier'} || 1;
$self->{'pipeline_score'} = undef;
}
return $self->{'pipeline_score'} if $self->{'pipeline_score'};
$self->{'pipeline_score'} = ($self->{'sig_multiplier'} * $self->{'sig_score'})
+ ($self->{'coverage_multiplier'} * $self->pcrm_gene_coverage_score());
return $self->{'pipeline_score'}
}
# --
# --
# -- pcrm_block_coverage_score
# -- set (if necessary) and return the block coverage score. this is
# -- calculated as (pCRM blocks containing this motif / total pCRM blocks)
sub pcrm_block_coverage_score
{
my $self = shift;
my $score = shift;
$self->{'pcrm_block_coverage_score'} = $score if $score;
return $self->{'pcrm_block_coverage_score'};
}
# --
# --
# -- pcrm_gene_coverage_score
# -- set (if necessary) and return the gene coverage score. this is
# -- calculated as (sequences containing this motif / total sequence count)
sub pcrm_gene_coverage_score
{
my $self = shift;
my $score = shift;
$self->{'pcrm_gene_coverage_score'} = $score if $score;
return $self->{'pcrm_gene_coverage_score'};
}
# --
# --
# -- pcrm_block_count
# -- how many pCRM blocks contain this motif?
sub pcrm_block_count
{
my $self = shift;
if (! $self->{'pcrm_block_count'}) {
$self->{'pcrm_block_count'} = 0;
for my $abbv (keys %{ $self->{'pcrm_blocks'} }) {
$self->{'pcrm_block_count'} += scalar (keys %{ $self->{'pcrm_blocks'}->{$abbv} } );
}
}
return $self->{'pcrm_block_count'};
}
# --
# --
# -- pcrm_gene_count
# -- how many gene sequences conain this motif?
sub pcrm_gene_count
{
my $self = shift;
if (! $self->{'pcrm_gene_count'}) {
$self->{'pcrm_gene_count'} = 0;
for my $abbv (keys %{ $self->{'pcrm_blocks'} }) {
$self->{'pcrm_gene_count'}++ if scalar (keys %{ $self->{'pcrm_blocks'}->{$abbv} } );
}
}
return $self->{'pcrm_gene_count'};
}
1;
__END__
motif_rank.pm
package motif_rank;
# --
# -- given a list of sequences and a list of motifs, draw a heat map of
# -- motif density for each sequence.
# --
# -- graphing routines based on overlap.pl
# --
# --
# --
# --
# -- $Author: zburke $
# -- $Revision: 1.2 $
# -- $Date: 2005/10/20 05:22:44 $
# --
BEGIN {
# output FASTA filename
use constant OUTPUT_FASTA_FILE => "update.fa";
# output phi-score filename
use constant OUTPUT_PHI_SCORE_FILE => "../phi_score.txt";
$ENV{'BG_PERL_LIB'} = "/home/zburke/bg/workspace/perl-lib" unless $ENV{'BG_PERL_LIB'};
$ENV{'HOME_PERL_LIB'} = "." unless $ENV{'HOME_PERL_LIB'};
}
# --
# -- nothing to configure below
# --
use strict;
use Getopt::Long;
use Data::Dumper;
use lib $ENV{'HOME_PERL_LIB'};
use GD::Graph;
use GD::Graph::mixed;
use Statistics::Descriptive;
use lib $ENV{'BG_PERL_LIB'};
use scan_parse;
use nazina_db;
use extract;
use gene;
use scope;
use pcrm_block;
use motif;
use motif_rank_plot;
use util;
# characters per line in output FASTA file
use constant FASTA_LINE_LENGTH => 60;
# DEPRECATED; distribute pCRMs by density in this many groups
use constant MOTIF_COUNT_BINS => 6;
# height of a true CRM
use constant PLOT_HEIGHT => 0.4;
# distance between motif-position plots
use constant PLOT_INCREMENT => 0.1;
# --
# --
# -- summary_plot
# -- plot pCRMs ranked by density across all sequences in the set, e.g. if
# -- there are 5 sequences with 10 pCRMs each, rank pCRMs from 1-50 instead
# -- of 1-10 within each sequence.
sub summary_plot
{
my ($summary) = @_;
# get array containing all density-profiles
my $list = summary_plot_prepare($summary);
for my $abbv (sort keys %{ $summary }) {
print STDERR "plotting $abbv summary\n";
my $set = ${ $summary }{$abbv};
# initialize plot_fields to zap some of the arrays set above
$set->{'plot_fields'}->{'abbv'} = "$abbv.summary";
$set->{'plot_fields'}->{'pcrms'} = [];
my @empty_pcrm;
for (0..($#{$set->{'plot_fields'}->{'fields'}})) {
push @empty_pcrm, 0;
}
for my $block ( sort {$b->{'density'} <=> $a->{'density'}} @{ $list } ) {
if ($block->{'abbv'} eq $abbv) {
push @{ $set->{'plot_fields'}->{'pcrms'} }, $block->{'plot'};
} else {
push @{ $set->{'plot_fields'}->{'pcrms'} }, [@empty_pcrm];
}
}
# plot it
motif_rank_plot::plot($set->{'plot_fields'});
}
summary_plot_legend($list);
}
# --
# --
# -- summary_plot_prepare
# -- create an array of pCRMs across all sequences in a set
sub summary_plot_prepare
{
my ($summary) = @_;
my @list;
for my $abbv (keys %{ $summary }) {
my $set = ${ $summary }{$abbv};
for my $block (sort {$b->{'density'} <=> $a->{'density'} } @{ $set->{'blocks'} }) {
push @list, {
'abbv' => $abbv,
'density' => $block->{'density'},
'plot' => $block->{'plot'},
};
}
}
return \@list;
}
# --
# --
# -- summary_plot_legend
# -- plot an HTML table that displays a density legend for all pCRMs in the set
sub summary_plot_legend
{
my ($list) = @_;
my $divisor = $#{ $list } || 1;
my $increment = int(255 / $divisor);
open FILE, ">legend.html" or die ("couldn't open legend.html");
print FILE "
";
my $i = 0;
for my $block ( sort {$b->{'density'} <=> $a->{'density'}} @{ $list } ) {
my $color = GD::Graph::colour::rgb2hex(255, $i * $increment, $i * $increment);
print FILE "
$block->{'density'}
\n";
$i++;
}
print FILE "
\n";
close FILE;
}
# --
# --
# -- blocks2bins
# -- convert pCRM blocks into plottable bins.
sub blocks2bins
{
my ($blocks, $count_distribution, $gene) = @_;
my $plot_height = PLOT_HEIGHT / 2;
my $sequence_length = length $gene->{'sequence'};
my (@pcrm_blocks, @zero_blocks);
for my $block (sort {$b->{'density'} <=> $a->{'density'} } @{ $blocks }) {
# handle all blocks with density == 0 in one unit
if ($block->{'density'} == 0) {
push @zero_blocks, $block;
next;
}
my $profile = profile([$block], $plot_height, $sequence_length);
$block->{'plot'} = $profile;
push @pcrm_blocks, $profile;
}
# 0-density handler
if (scalar @zero_blocks) {
my $profile = profile(\@zero_blocks, $plot_height, $sequence_length);
for my $block (@zero_blocks) {
$block->{'plot'} = $profile;
}
push @pcrm_blocks, $profile;
}
return \@pcrm_blocks;
}
# --
# --
# -- motif_blocks2bins
# -- convert a list of motifs and positions where they occur into plottable
# -- objects
sub motif_blocks2bins
{
my ($motifs, $gene, $y_min) = @_;
my @plotable;
my $height = -0.1;
for my $motif (@{ $motifs }) {
push @plotable, profile($motif->{'blocks' }, $height , length $gene->{'sequence'});
$height -= PLOT_INCREMENT;
}
${ $y_min } = $height;
return \@plotable;
}
# --
# --
# -- fasta_read
# -- read fasta data into an array of hash-refs: { abbv, beg, end, sequence }
sub fasta_read
{
my ($params) = @_;
my (@blocks, $block, @lines);
open FILE, $params->{'fasta'} or die("couldn't open $params->{'fasta'}: $!");
while () {
chomp;
if (/^>/) {
if ($block) {
$block->{'sequence'} = lc join "", @lines;
push @blocks, $block;
}
$block = {};
@lines = ();
my ($abbv, $beg, $end) = ($_ =~ />([^\s]+) ([0-9]+) - ([0-9]+)/);
$block = {
'abbv' => $abbv,
'beg' => $beg,
'end' => $end};
next;
}
push @lines, $_;
}
if ($block) {
$block->{'sequence'} = join "", @lines;
push @blocks, $block;
}
close FILE;
my %genes;
for my $block (@blocks) {
push @{ $genes{$block->{'abbv'}} }, $block;
}
return \%genes;
}
# --
# --
# -- fasta_write
# -- slice off the pCRM blocks we choose not to keep, then print FASTA
# -- sequences for the remaining blocks to the file OUTPUT_FASTA_FILE.
# - Because one gene may have multiple regulators, update.fa is opened
# -- for APPENDING. This file was truncated in init();
sub fasta_write
{
my ($blocks, $gene, $params) = @_;
my $sorted = fasta_write_trim($blocks, $params);
# # sort blocks so we can pull off the top N percent
# my @sorted = sort {$a->{'density'} <=> $b->{'density'} } @{ $blocks };
# # $params->{'keep'} specifies the % of blocks to keep
# # always remove at least one block
# my $rm_count = (scalar @sorted / (100 / (100 - $params->{'keep'})));
# $rm_count = ($rm_count < 1 ? 1 : $rm_count);
# splice @sorted, 0, $rm_count;
open FILE, '>>', OUTPUT_FASTA_FILE;
for my $block (sort {$a->{'beg'} <=> $b->{'beg'} } @${ sorted }) {
my $sequence = lc substr($gene->{'sequence'}, $block->{'beg'}, ($block->{'end'} - $block->{'beg'}) + 1);
print FILE ">$gene->{'abbv'} $block->{'beg'} - $block->{'end'}\n";
for (my $i = 0; $i < length($sequence); $i+= FASTA_LINE_LENGTH)
{
print FILE substr($sequence, $i, FASTA_LINE_LENGTH). "\n";
last if (substr($sequence, $i, FASTA_LINE_LENGTH) eq "");
}
print FILE "\n";
}
close FILE;
}
sub fasta_write_trim
{
my ($blocks, $params) = @_;
# sort blocks so we can pull off the top N percent
my @sorted = sort {$a->{'density'} <=> $b->{'density'} } @{ $blocks };
# $params->{'keep'} specifies the % of blocks to keep; convert that
# into a number of blocks to remove.
my $rm_count = (scalar @sorted / (100 / (100 - $params->{'keep'})));
# always remove at least one block
if ($params->{'block_rm_1'}) {
$rm_count = ($rm_count < 1 ? 1 : $rm_count);
}
# count of blocks removed so far
my $removed = 0;
# if there are blocks with 0-density, remove *all* of them, then see
# what is left to remove
if ($params->{'block_rm_0'}) {
while (@sorted) {
if (0 == $sorted[0]->{'density'}) {
shift @sorted;
$removed++;
} else {
last;
}
}
}
if ($removed < $rm_count) {
splice @sorted, 0, ($rm_count - $removed);
}
return \@sorted;
}
# --
# --
# -- gene_select
# -- return list of genes to plot, as specified on the command line
sub gene_select
{
my (%genes) = @_;
# no genes specified so scan 'em all ...
return %genes if ($#ARGV == 0);
# ... or scan named genes only
my %list;
for (my $i = 1; $i < scalar @ARGV; $i++) {
$list{$ARGV[$i]} = $genes{$ARGV[$i]};
}
return %list;
}
# --
# --
# -- motif_read
# -- read motifs from a file, one per line
sub motif_read
{
my ($params) = @_;
my (@motifs);
open FILE, "<", $params->{'motifs'} or die("couldn't open $params->{'motifs'}: $!");
while () {
chomp;
my $motif = motif->new(%{ scan_parse::line_read($_, scope::motif_line_format()) });
push @motifs, $motif;
}
close FILE;
# # return top N motifs only, if N > 0
# if ($params->{'top'} > 0) {
#
# splice @motifs, $params->{'top'};
#
# }
return \@motifs;
}
# --
# --
# -- motif_count
# -- count the number of motifs in each pCRM block, then calculate density
# -- scores (count / block-length) in order to normalize the block-scores.
# -- without normalization, we would inadvertantly reward a long block with
# -- low density and could penalize a short block with high density.
# --
# -- the blocks array is passed by reference so density scores are added to
# -- the block objects as a side effect. for convenience, an array of the
# -- density scores is also returned. obviously, we could generate this
# -- later by looping over the blocks, but it's convenient to create here.
sub motif_count
{
my ($blocks, $motifs, $gene) = @_;
my (@counts);
my $i_max = length $gene->{'sequence'};
for (my $i = 0; $i < $i_max; $i++) {
for my $motif (@{ $motifs }) {
my $word = substr $gene->{'sequence'}, $i, $motif->{'length'};
my $re = $motif->{'re'};
if ($word =~ /^$re$/ ) {
# count motif in block
for my $block (@{ $blocks }) {
if ($i >= $block->{'beg'} && ($i + $motif->{'length'}) <= $block->{'end'}) {
$block->{'motifs'}{$motif->{'motif'}}++;
$block->{'motif_count'}++;
$motif->{'pcrm_blocks'}->{$gene->{'abbv'}}->{$block->{'beg'}} = 1;
last; # can't be in more than one block at a time
}
}
# log position of this motif, which may not be within a pCRM
push @{ $motif->{'blocks'} }, {'beg' => $i, 'end' => ($i + $motif->{'length'})};
}
}
}
for my $block (@{ $blocks }) {
$block->{'motif_count'} = $block->{'motif_count'} || 0;
my $max = length $block->{'sequence'};
# calculate density score
$block->{'density'} = $block->{'motif_count'} / $max;
print "$block->{'abbv'} $block->{'beg'} - $block->{'end'}\n";
print "\ttotal motif-count: $block->{'motif_count'} (density score: $block->{'density'})\n";
print "\tdistinct motifs: ". scalar (keys %{ $block->{'motifs'} }) . " of " . (scalar @{ $motifs }). "\n";
push @counts, $block->{'density'};
}
return \@counts;
}
# --
# --
# -- gene
# -- return a gene object, given its abbreviation
sub gene
{
my ($abbv) = @_;
my $dao = nazina_db->new();
my $list = $dao->genes_by_abbv();
return $dao->gene($list->{$abbv}->{'gene_id'});
}
# --
# --
# -- threshold_normalize
# -- normalize an array such that 0 <= values <= 1
sub threshold_normalize
{
my ($list, $min, $max) = @_;
my (@threshold, $adjust);
# how much to bump up scores to get them all in the range 0 - $max
$adjust = 0 - $min;
$max += $adjust;
for (@{ $list }) {
push @threshold, (($_ + $adjust) / $max);
}
return \@threshold;
}
# --
# --
# -- profile
# -- build plot-data arrays from CRM, pCRM sequence data.
sub profile
{
my ($blocks, $height, $max) = @_;
my @list;
for my $block (@{ $blocks }) {
for (my $i = $block->{'beg'}; $i < $block->{'end'}; $i++) {
$list[$i] = $height;
}
}
$list[$max] = 0 unless $list[$max];
return \@list;
}
# --
# --
# -- phi_score
# -- calculate a phi_score given the pCRMs and CRMs in this sequence
sub phi_score
{
my ($pcrm_blocks, $gene) = @_;
my ($tp, $fp, $fn, $in_pcrm, $in_crm) = (0, 0, 0);
my $crm_blocks = $gene->enhancers();
my $max = length $gene->{'sequence'};
for (my $i = 0; $i < $max; $i++) {
$in_pcrm = util::in_crm($pcrm_blocks, $i); # in pCRM?
$in_crm = util::in_crm($crm_blocks, $i); # in CRM?
if ($in_pcrm && $in_crm) { $tp++; }
elsif ($in_pcrm && ! $in_crm) { $fp++; }
elsif (! $in_pcrm && $in_crm) { $fn++; }
}
my $phi = util::phi_score($tp, $fp, $fn);
return {
'phi' => $phi,
'tp' => $tp,
'fp' => $fp,
'fn' => $fn,
};
}
sub phi_write
{
my ($summary) = @_;
# $summary{$abbv} = {
# 'blocks' => $blocks,
# 'plot_fields' => $plot_fields,
# 'phi_score' => phi_score($blocks, $gene),
open FILE, ">>", OUTPUT_PHI_SCORE_FILE;
for (sort keys %{ $summary }) {
print FILE "\t$_\t$summary->{$_}->{'phi_score'}\t$summary->{$_}->{'tp_score'}\t$summary->{$_}->{'fp_score'}\n";
}
close FILE;
}
# --
# --
# --
# -- cribbed from overlap.pl
sub phi_score_block
{
my ($blocks, $gene) = @_;
my ($tp, $fp, $fn) = (0, 0, 0);
PCRM:
for my $pcrm (@{ $blocks }) {
CRM:
for my $crm (@{$gene->enhancers()}) {
# find the following overlaps:
# 1 2 3 4
# pcrm: AAAA | AAAA | AA | AAAA
# crm: BBBB | BBBB | BBBB | BB
# 4 is special case of 1 and is therefore omitted
# skip to the next one
if ( ($crm->{'beg'} >= $pcrm->{'beg'} && $crm->{'beg'} <= $pcrm->{'end'})
|| ($crm->{'end'} >= $pcrm->{'beg'} && $crm->{'end'} <= $pcrm->{'end'})
|| ($crm->{'beg'} <= $pcrm->{'beg'} && $crm->{'end'} >= $pcrm->{'end'})
) {
$tp++;
next PCRM;
}
}
# didn't match any CRMs
$fp++;
}
# how many CRMs did we miss?
$fn = (scalar @{$gene->enhancers()}) - $tp;
my $phi = util::phi_score($tp, $fp, $fn);
return {
'phi' => $phi,
'tp' => $tp,
'fp' => $fp,
'fn' => $fn,
};
}
1;
motif_rank_plot.pm
package motif_rank_plot;
# --
# -- 2005-07-12, zak.burke@dartmouth.edu
# --
# -- given FASTA-formatted sequences and a list of motifs, rank the
# -- sequences by motif-count.
# --
# -- graphing routines based on overlap.pl
# --
# --
# --
# --
# -- $Author: zburke $
# -- $Revision: 1.10 $
# -- $Date: 2005/10/20 05:21:43 $
# --
BEGIN {
$ENV{'HOME_PERL_LIB'} = "" unless $ENV{'HOME_PERL_LIB'};
}
use strict;
use lib $ENV{'HOME_PERL_LIB'};
use GD::Graph;
use GD::Graph::mixed;
use GD::Graph::hbars;
use constant TRUE => 1;
use constant FALSE => 0;
use constant PLOT_BARS => "bars";
use constant PLOT_LINES => "lines";
use constant PLOT_POINTS => "points";
use constant PLOT_WIDTH => 800;
use constant PLOT_HEIGHT => 125;
use constant PLOT_FGCLRBOX => "white";
use constant PLOT_FGCLR => "black";
use constant CLR_R_MAX => 204;
use constant CLR_MAX => 255;
use constant MOTIF_COLORS => qw(dgray dblue dgreen dred dpurple);
# --
# --
# -- plot
# -- prepare a plot, then create it
sub plot
{
# $params->fields: array-ref, each value is a sequence-offset
# $params->crms: array-ref, non-zero indicates CRM
# $params->pcrms: array-ref of array-refs, non-zero indicates pCRM
# $params->abbv: string, name of gene to plot
my ($params) = @_;
# configure fields, exons, promoters, CRMs
plot_prepare($params);
# prepare pCRM regions
pcrm_color_by_density($params);
# prepare motif regions
motif_lines($params);
# create the graph
graph($params);
}
# --
# --
# -- plot_prepare
# -- configure a plot's fields, exons, promoters and CRMs
sub plot_prepare
{
my ($params) = @_;
# prepare fields (x axis values)
push @{ $params->{'data'} }, $params->{'fields'};
# prepare CRM regions, promoters and exons
plot_bars($params, "exons", "gold", PLOT_BARS);
plot_bars($params, "promoters", "dgreen", PLOT_BARS);
plot_bars($params, "crms", "dblue", PLOT_BARS);
}
# --
# --
# -- graph
# -- write a plot to a .png file
sub graph
{
my ($params) = @_;
eval
{
my $graph = GD::Graph::mixed->new(PLOT_WIDTH, ($params->{'motifs'} ? PLOT_HEIGHT * 2 : PLOT_HEIGHT)) or die GD::Graph->error;
$graph->set(
title => $params->{'abbv'}, # headline
types => $params->{'types'}, # type of each plot (bars|lines|points)
dclrs => $params->{'dclrs'}, # color of each plot
boxclr => PLOT_FGCLRBOX, # background color of plot field
labelclr => PLOT_FGCLR, # color of axis labels
axislabelclr => PLOT_FGCLR, # color of axis-position labels
fgclr => PLOT_FGCLR, # color of axes
markers => [10], # for points, use only the | glyph
x_label_skip => 1000, # skip n ticks and tick-labels along x axis
y_number_format => "", # no labels along y axis
zero_axis_only => 1, # print axis y = 0 and print x tick-labels there
x_label => "sequence position",
y_max_value => $params->{'y_max'} || 1,
y_min_value => $params->{'y_min'} || ($params->{'motifs'} ? -2 : 0),
correct_width => FALSE,
) or die $graph->error;
# legend labels must match the order of fields in $params->{'data'}
$graph->set_legend(qw(Exon Promoter CRM pCRM));
my $gd = $graph->plot($params->{'data'}) or die $graph->error;
open(IMG, ">$params->{'abbv'}.motif_profile.png") or die $!;
binmode IMG;
print IMG $gd->png();
close IMG;
};
if ($@) {
print STDERR "ERROR: $@";
}
}
# --
# --
# -- pcrm_color_by_density
# -- pCRM regions are colored from red->pink->white as motif density sinks
# -- this assumes the @{ $params->{pcrms} } array is already in order from
# -- dense to sparse
sub pcrm_color_by_density
{
my ($params) = @_;
# steps between red and white
my $divisor = $#{ $params->{'pcrms'} } || 1;
my $increment = int(CLR_MAX / int(($divisor+1) / 2 ) );
my (@dcolors, @colors, @roy, @gee, @biv);
for (my $i = 0; $i <= int(($divisor + 1) / 2); $i++) {
push @colors, {'r' => CLR_R_MAX, 'g' => 0, 'b' => $i * $increment};
push @biv, {'r' => CLR_R_MAX, 'g' => $i * $increment, 'b' => CLR_MAX};
}
shift @biv;
push @colors, @biv;
for (my $i = 0; $i <= $#{ $params->{'pcrms'} }; $i++) {
my $color = GD::Graph::colour::rgb2hex(
$colors[$i]->{'r'},
$colors[$i]->{'g'},
$colors[$i]->{'b'}
);
push @dcolors, $color;
GD::Graph::colour::add_colour($color);
push @{ $params->{'dclrs'} }, $color;
push @{ $params->{'types'} }, PLOT_BARS;
push @{ $params->{'data'} }, ${ $params->{'pcrms'} }[$i];
}
# plot a legend
plot_legend(\@dcolors, $params);
}
# --
# --
# -- motif_points
# -- color bars for motifs; rotating rainbow
sub motif_points
{
my ($params) = @_;
return unless $params->{'motifs'};
#my @colors = qw(lgreen lblue lyellow lpurple cyan lorange);
my @colors = MOTIF_COLORS; # qw(lgreen lblue lyellow lpurple cyan lorange);
for (my $i = 0; $i <= $#{ $params->{'motifs'} }; $i++) {
# plot is too cluttered with > 5 motif pictures
last if $i > 12;
my $color = $colors[$i % (scalar @colors)];
push @{ $params->{'dclrs'} }, $color;
push @{ $params->{'types'} }, PLOT_POINTS;
push @{ $params->{'data'} }, ${ $params->{'motifs'} }[$i];
# debugging output
#motif_location_display(${ $params->{'motifs'} }[$i]);
}
}
# --
# --
# -- motif_lines
# -- plot motif locations along a staff of lines
sub motif_lines
{
my ($params) = @_;
return unless $params->{'motifs'};
my $x_max = scalar @{ ${ $params->{'motifs'} }[0] };
#my @colors = qw(lgreen lblue lyellow lpurple cyan lorange);
my @colors = MOTIF_COLORS; #qw(lgreen lblue lyellow lpurple cyan lorange);
for (my $i = 0; $i <= $#{ $params->{'motifs'} }; $i++) {
my $color = $colors[$i % (scalar @colors)];
# determine & configure y-intercept for staff-line
my $height = 0;
for (@{ ${ $params->{'motifs'} }[$i] }) {
if ($height = $_) {
push @{ $params->{'dclrs'} }, $color;
push @{ $params->{'types'} }, PLOT_LINES;
push @{ $params->{'data'} }, plot_line_at($x_max, $height);
last;
}
}
# configure motif positions
push @{ $params->{'dclrs'} }, $color;
push @{ $params->{'types'} }, PLOT_POINTS;
push @{ $params->{'data'} }, ${ $params->{'motifs'} }[$i];
}
}
# --
# --
# -- motif_set_background
# -- paint a big white block from x = 0 to x = sequence-length, y = 0 to y = -1
# -- then we can plot motifs atop it
sub motif_set_background
{
my ($params) = @_;
my (@block, $max);
# number of points in the plot, i.e. length of the sequence
$max = scalar @{ ${ $params->{'motifs'} }[0] };
push @{ $params->{'dclrs'} }, "white";
push @{ $params->{'types'} }, PLOT_BARS;
for (0..$max) {
push @block, -1;
}
push @{ $params->{'data'} }, \@block;
}
# --
# --
# -- plot_line_at
# -- prepare the points to plot a line from 0..x at height y
sub plot_line_at
{
my ($x, $y) = @_;
my (@blocks);
for (0..$x) {
push @blocks, $y;
}
return \@blocks;
}
# --
# --
# -- motif_location_display
# -- prints locations of motifs in a block to STDOUT
sub motif_location_display
{
my ($list) = @_;
my $max = $#{ $list } + 1;
for (my $i = 0; $i < $max; $i++) {
next unless $list->[$i];
print "$i\n";
}
}
# --
# --
# -- plot_bars
# -- configure a particular param field to be plotted as bars
sub plot_bars
{
my ($params, $field, $color, $type) = @_;
push @{ $params->{'data'} }, $params->{$field};
push @{ $params->{'dclrs'} }, $color;
push @{ $params->{'types'} }, $type;
}
# --
# --
# -- plot_legend
# -- create an HTML graph legend mapping density => color
sub plot_legend
{
my ($colors, $params) = @_;
my (@data, @dcolors, @types);
my $i = 0;
for (sort {$b <=> $a} @{ $params->{'density_values'} }) {
my $color = shift @{ $colors };
push @types, "bars";
push @{ $data[0] }, $i;
my @row;
$row[$#{ $params->{'density_values'} }] = 0;
$row[$i] = 1;
push @data, \@row;
push @dcolors, $color;
$i++;
}
eval
{
my $graph = GD::Graph::mixed->new(50, (scalar @{ $data[0]})*20) or die GD::Graph->error;
$graph->set(
types => \@types,
dclrs => \@dcolors,
x_ticks => FALSE,
x_label => "",
y_tick_number => 0,
y_max_value => 1,
y_min_value => 0,
y_number_format => "",
x_number_format => "",
boxclr => "white",
rotate_chart => TRUE,
) or die $graph->error;
my $gd = $graph->plot(\@data) or die $graph->error;
open(IMG, ">$params->{'abbv'}.legend.png") or die $!;
binmode IMG;
print IMG $gd->png();
close IMG;
};
if ($@) {
print STDERR "ERROR: $@";
}
}
1;
nazina.pm
package nazina;
# --
# -- nazina
# --
# --
# -- zak.burke@dartmouth.edu, 2005-03-27
# --
# --
# --
# -- $Author: zburke $
# -- $Revision: 1.2 $
# -- $Date: 2005/10/20 05:19:03 $
# --
use DBI;
use CGI;
use strict;
sub sequence_annotate
{
my ($seq, $elements) = @_;
my ($string, @sequence, $len, $i, $j) = ("", (), 0, 0, 0);
my %colors = (
1 => '#6699cc', # promoter
2 => '#ff0000', # enhancer - early
3 => '#ff0000', # enhancer - late
4 => '#ffff99', # exon
5 => '#ff0000', # enhancer
);
@sequence = split '', $$seq;
# sequence element counts are 1-based so we add an element
# to the beginning of the list to make counting easier
# likewise the for loop is 1.. <= length instead of 0..
unshift @sequence, 'x';
$len = length($$seq);
for ($i = 1; $i < $len; $i++) {
# tag element start
for (@{ $elements }) {
my $color = $colors{$_->{'type_id'}};
$string .= "{'element_id'}\">" if ($i == $_->{'beg'});
}
# current position
$string .= $sequence[$i];
# tag element end
for (@{ $elements }) {
my $color = $colors{$_->{'type_id'}};
$string .= "" if $i == $_->{'end'};
}
# wrap long lines
if ($j++ > 70) {
$string .= "\n";
$j = 0;
}
}
return $string;
}
# --
# --
# -- htmx
# -- read template from file; return it as a string
sub htmx
{
my $htmx = "";
open HTMX, "template.htmx";
while () { $htmx .= $_; };
close HTMX;
return $htmx;
}
1;
nazina_db.pm
package nazina_db;
# --
# -- nazina_db
# -- database connection and fetch routines to assist with
# -- with pulling data from the gene reguation database.
# --
# -- $Author: zburke $
# -- $Revision: 1.7 $
# -- $Date: 2005/10/20 05:19:03 $
# --
BEGIN {
$ENV{'BG_PERL_LIB'} = "/home/zburke/bg/workspace/perl-lib" unless $ENV{'BG_PERL_LIB'};
$ENV{'HOME_PERL_LIB'} = "." unless $ENV{'HOME_PERL_LIB'};
}
use DBI;
use strict;
use lib $ENV{'BG_PERL_LIB'};
use gene;
use constant USERNAME => "nazina";
use constant PASSWORD => "nazina";
sub new
{
# class name
my $class = shift;
# named input parameters
my %input = @_;
my $self = {};
bless ($self, $class);
# use dbh if it's available, or create a new one.
$self->{'dbh'} = $input{'dbh'} || $self->dbh();
return $self;
}
# --
# --
# -- genes
# -- retrieve all genes from the DB
sub genes
{
my $self = shift;
# return cached copy, if available
return $self->{'genes'} if $self->{'genes'};
my %list;
my $sth = $self->{dbh}->prepare("select gene_id, name, abbreviation, sequence from gene order by abbreviation");
$sth->execute();
while (my ($gene_id, $name, $abbv, $sequence) = $sth->fetchrow_array()) {
$list{$gene_id} = {
'gene_id' => $gene_id,
'name' => $name,
'abbv' => $abbv,
'sequence' => $sequence,
'dao' => $self,
};
}
# cache this collection for fast retrieval later on
$self->{'genes'} = \%list;
return $self->{'genes'};
}
# --
# --
# -- genes_by_abbv
# -- list of genes keyed by their abbreviated names
sub genes_by_abbv
{
my $self = shift;
my $genes = $self->genes();
my %list;
for (keys %{ $genes }) {
$list{ ${ $genes }{$_}->{'abbv'} } = ${ $genes }{$_};
}
return \%list;
}
# --
# --
# -- gene
# -- retrieve gene by id
sub gene
{
my $self = shift;
my ($gene_id) = @_;
my %list;
my $sth = $self->{dbh}->prepare("select name, abbreviation, sequence, motifs from gene where gene_id = ?");
$sth->execute($gene_id);
my ($name, $abbreviation, $sequence, $motifs) = $sth->fetchrow_array();
my $ref = {
'gene_id' => $gene_id,
'name' => $name,
'abbv' => $abbreviation,
'sequence' => $sequence,
'motifs' => $motifs,
'dao' => $self,
};
my $gene = gene->new(%{ $ref });
return $gene;
}
# --
# --
# -- gene_elements
sub gene_elements
{
my $self = shift;
my ($gene_id) = @_;
my @elements;
my $sth = $self->{dbh}->prepare("
select element_id, element_type_id, name,
position_start, position_end
from element
where gene_id = ?
order by position_start");
$sth->execute($gene_id);
while (my ($element_id, $type_id, $name, $beg, $end) = $sth->fetchrow_array()) {
my $regulators = $self->regulators($element_id);
my $element = {
'element_id' => $element_id,
'type_id' => $type_id,
'name' => $name,
'beg' => $beg,
'end' => $end,
'regulators' => $regulators,
};
push @elements, $element;
}
return \@elements;
}
# --
# --
# -- gene_enhancers
# -- return enhancer elements for a gene
sub gene_enhancers
{
my $self = shift;
my ($gene_id) = @_;
my @list;
my $elements = $self->gene_elements($gene_id);
for (@{ $elements }) {
next unless $_->{'type_id'} == 2 || $_->{'type_id'} == 3;
push @list, $_;
}
return \@list;
}
# --
# --
# -- gene_enhancers_by_regulator
# -- get enhancers for a gene which are known to be regulated by the
# -- given gene_id
sub gene_enhancers_by_regulator
{
my $self = shift;
my ($gene_id, $regulator_id) = @_;
my @list = ();
my $enhancers = $self->gene_enhancers($gene_id);
for my $enhancer (@{ $enhancers }) {
for my $reg (@{ $enhancer->{'regulators'} }) {
if ($reg->{'gene_id'} == $regulator_id) {
push @list, $reg;
}
}
}
return \@list;
}
# --
# --
# -- gene_promoters
# -- return promoter elements for a gene
sub gene_promoters
{
my $self = shift;
my ($gene_id) = @_;
my @list;
my $elements = $self->gene_elements($gene_id);
for (@{ $elements }) {
next unless $_->{'type_id'} == 1;
push @list, $_;
}
return \@list;
}
# --
# --
# -- gene_exons
# -- return exon elements for a gene
sub gene_exons
{
my $self = shift;
my ($gene_id) = @_;
my @list;
my $elements = $self->gene_elements($gene_id);
for (@{ $elements }) {
next unless $_->{'type_id'} == 4;
push @list, $_;
}
return \@list;
}
# --
# --
# -- regulators
# -- regulators for a given element
sub regulators
{
my $self = shift;
my ($element_id) = @_;
my @list;
my $sth = $self->{dbh}->prepare("
select g.gene_id, g.name, g.abbreviation
from gene g, regulator r
where r.element_id = ?
and g.gene_id = r.gene_id
");
$sth->execute($element_id);
while (my ($r_id, $name, $abbv) = $sth->fetchrow_array()) {
push @list, {
'gene_id' => $r_id,
'name' => $name,
'abbv' => $abbv,
};
}
return \@list;
}
# --
# --
# -- motifs
# -- return motifs for a given gene
sub motifs
{
my $self = shift;
my ($gene_id) = @_;
my $sth = $self->{dbh}->prepare("select motifs from gene where gene_id = ?");
$sth->execute($gene_id);
my ($motifs) = $sth->fetchrow_array();
return $motifs;
}
# --
# --
# -- regulatory_elements
# -- retreive all genes that are known to regulate other genes
sub regulatory_elements
{
my $self = shift;
my $genes = $self->genes();
my $list = undef;
my $sth = $self->{dbh}->prepare("select gene_id, element_id from regulator");
$sth->execute();
while (my ($gene_id, $element_id) = $sth->fetchrow_array()) {
$list->{$gene_id} = $genes->{$gene_id};
for my $element (@{ $self->elements() }) {
if ($element->{'element_id'} == $element_id) {
$element->{'gene'} = $genes->{ $element->{'gene_id'} };
push @{ $list->{$gene_id}->{'elements'} }, $element;
}
}
}
return $list;
}
# --
# --
# -- elements
# -- all elements
sub elements
{
my $self = shift;
return $self->{'elements'} if ($self->{'elements'});
my @elements;
my $sth = $self->{dbh}->prepare("
select element_id, gene_id, element_type_id, name,
position_start, position_end, sequence
from element");
$sth->execute();
while (my $element = $sth->fetchrow_hashref()) {
$element->{'regulators'} = $self->regulators($element->{'element_id'});
push @elements, $element;
}
$self->{'elements'} = \@elements;
return $self->{'elements'};
}
# --
# --
# -- dbh
# -- new DB handle
sub dbh
{
my $self = shift;
return $self->{dbh} if $self->{dbh};
my $dsn = "DBI:mysql:database=nazina;host=localhost";
$self->{dbh} = DBI->connect($dsn, USERNAME, PASSWORD);
return $self->{dbh};
}
1;
pcrm_block.pm
package pcrm_block;
# --
# --
# --
# -- $Author: zburke $
# -- $Revision: 1.2 $
# -- $Date: 2005/10/20 05:19:03 $
# --
use strict;
sub new
{
my $class = shift;
my $self = {};
bless($self, $class);
my %input = @_;
$self->{'abbv'} = $input{'abbv'} || "";
$self->{'beg'} = $input{'beg'} || -1;
$self->{'end'} = $input{'end'} || -1;
$self->{'motif_count'} = $input{'motif_count'} || 0;
$self->{'density'} = $input{'density'} || 0;
$self->{'sequence'} = $input{'sequence'} || "";
$self->{'motifs'} = $input{'motifs'} || {};
return $self;
}
# --
# --
# -- score
sub score
{
}
1;
__END__
pipeline_util.pm
package pipeline_util;
# --
# -- utility methods for pipeline output files
# --
# -- $Author: zburke $
# -- $Revision: 1.2 $
# -- $Date: 2005/10/23 00:53:09 $
# --
BEGIN {
$ENV{'BG_PERL_LIB'} = "/home/zburke/bg/workspace/perl-lib" unless $ENV{'BG_PERL_LIB'};
$ENV{'BG_PERL_BIN'} = "/home/zburke/bg/workspace/pipeline" unless $ENV{'BG_PERL_BIN'};
}
use strict;
use warnings;
use GD::Graph::lines;
use GD::Graph::bars;
use GD::Graph::hbars;
# --
# --
# -- motifs_reader
# -- given a formatted motifs file, pull out the top motifs
# --
# -- file format:
# --
# motif sig p-cover g-cover pipeline
# grncbs 1.000000 0.857143 1.000000 2.200000
# cgatc 0.244534 0.396825 1.000000 1.444534
# agggg 0.129743 0.380952 1.000000 1.329743
#
sub motifs_reader
{
my ($filename, $params) = @_;
my @motifs;
my @lines = map { chomp; $_;} `head -$params->{'motif_count'} $filename`;
# ignore the header line
shift @lines;
for (@lines) {
# zap leading/trailing whitespace
s/^\s+//g; s/\s+$//g;
my ($motif, $sig, $pcover, $gcover, $pipeline) = split /\s/;
push @motifs, $motif;
}
return \@motifs;
}
# --
# --
# -- phi_score_reader
# -- given a formatted phi-score file, pull out and return the
# -- score for the requested gene/round combination. return -1
# -- if the requested tuple is not found.
# --
# -- fields: gene, phi-score, tp-positions, fp-positions
# -- fileformat:
#
#round 1
# btd 0.208278867102397 956 4117
# eve 0.688316606703276 3635 2186
# hb 0.197399783315276 911 4104
# kni 0.105945545471384 572 4943
# kr 0.417505924170616 2819 2381
# otd 0.260181268882175 2153 6024
#round 2
# btd 0.2333984375 956 3623
# eve 0.210315759394199 1472 1741
# hb 0.209136822773186 911 3845
# kni 0.122065727699531 572 4230
# kr 0.42992222052768 2819 2186
# otd 0.267719472767968 2153 5791# ...
# round n
# ...
sub phi_score_reader
{
my ($filename, $round, $abbv) = @_;
my ($i, $gene, $junk, %rounds) = (0, "", "", ());
my $value = {};
open PHI_FILE, "<", $filename or die ("couldn't open $filename: $!");
while () {
chomp;
if (/^round/) {
($junk, $i) = split /\s+/, $_;
next;
}
# trim leading and trailing whitespace
s/^\s+//g; s/\s+$//g;
my ($gene, $phi, $tp, $fp) = split /\s+/, $_;
if ($gene eq $abbv && $round eq "round_$i") {
$value = {
'phi' => $phi,
'tp' => $tp,
'fp' => $fp,
'tp_fp' => 0.0,
'rho' => 0.0,
};
last;
}
}
close PHI_FILE;
if ($value->{'fp'}) {
$value->{'tp_fp'} = $value->{'tp'} / $value->{'fp'};
$value->{'rho'} = $value->{'tp'} / ($value->{'tp'} + $value->{'fp'});
}
return $value;
}
sub phi_score_reader_summary
{
my ($filename) = @_;
my ($i, $gene, $junk, $summary) = (0, "", "", {});
open PHI_FILE, "<", $filename or die ("couldn't open $filename: $!");
while () {
chomp;
if (/^round/) {
($junk, $i) = split /\s+/, $_;
next;
}
# trim leading and trailing whitespace
s/^\s+//g; s/\s+$//g;
my ($gene, $phi, $tp, $fp) = split /\s+/, $_;
$summary->{$i}->{$gene}->{'phi'} = $phi;
$summary->{$i}->{$gene}->{'tp'} = $tp;
$summary->{$i}->{$gene}->{'fp'} = $fp;
$summary->{$i}->{$gene}->{'tp_fp'} = 0.0;
$summary->{$i}->{$gene}->{'rho'} = 0.0;
$summary->{$i}->{$gene}->{'rho2'} = 0.0;
# calculate tp/fp and tp/(tp + fp) only if fp is non-zero
if ($summary->{$i}->{$gene}->{'fp'}) {
$summary->{$i}->{$gene}->{'tp_fp'} = $summary->{$i}->{$gene}->{'tp'} / $summary->{$i}->{$gene}->{'fp'};
$summary->{$i}->{$gene}->{'rho'} = $summary->{$i}->{$gene}->{'tp'} / ( $summary->{$i}->{$gene}->{'tp'} + $summary->{$i}->{$gene}->{'fp'});
$summary->{$i}->{$gene}->{'rho2'} = $summary->{$i}->{$gene}->{'tp'}
/ ( $summary->{$i}->{$gene}->{'tp'}
+ $summary->{$i}->{$gene}->{'fp'}
* $summary->{$i}->{$gene}->{'fp'}
);
}
}
close PHI_FILE;
return $summary;
}
# --
# --
# -- summary_graph
# -- plot phi-scores for each gene, and average across all genes, for each round.
sub summary_graph
{
my ($params, $plot, $filename) = @_;
eval
{
my $graph = GD::Graph::lines->new(600, 600) or die GD::Graph->error;
$graph->set(
title => "phi scores", # headline
types => $plot->{'types'}, # type of each plot (bars|lines|points)
x_label => "round number",
y_label => "phi score",
# line_width => 4
) or die $graph->error;
# legend labels must match the order of fields in $params->{'data'}
$graph->set_legend(@{ $plot->{'legend'} });
my $gd = $graph->plot($plot->{'data'}) or die $graph->error;
open(IMG, ">$filename") or die $!;
binmode IMG;
print IMG $gd->png();
close IMG;
};
if ($@) {
print STDERR "ERROR: $@";
}
}
# --
# --
# -- summary_graph
# -- plot phi-scores for each gene, and average across all genes, for each round.
sub summary_graph_bars
{
my ($params, $plot, $filename, $title) = @_;
eval
{
my $graph = GD::Graph::hbars->new(400, 1000) or die GD::Graph->error;
$graph->set(
title => $title || "phi score", # headline
types => $plot->{'types'}, # type of each plot (bars|lines|points)
x_label => "",
y_label => $title || "phi score",
# x_labels_vertical => 1,
show_values => 1,
values_format => "%.8f",
# values_vertical => 1,
) or die $graph->error;
# legend labels must match the order of fields in $params->{'data'}
$graph->set_legend(@{ $plot->{'legend'} }) if $plot->{'legend'};
my $gd = $graph->plot($plot->{'data'}) or die $graph->error;
open(IMG, ">$filename") or die $!;
binmode IMG;
print IMG $gd->png();
close IMG;
};
if ($@) {
print STDERR "ERROR: $@";
}
}
# --
# --
# -- summary_graph_data
# -- helper for summary_graph. mungs data into GD::graph compatible format
# --
# -- $params->{'genes'}->{$abbv}->{$round}->{'phi'} = x.y;
# -- $params->{'rounds'} = [a.b, c.d, ...];
sub summary_graph_data
{
my ($params) = @_;
my (@data, @types, @legend);
# prepare scores for each gene
for (sort keys %{ $params->{'genes'} }) {
my $details = $params->{'genes'}->{$_};
push @legend, $_;
my @row = ();
# for this gene, prepare score for each round
for my $round_key (sort keys %{ $details } ) {
my $round = $details->{$round_key};
push @types, "lines";
push @row, $round->{'phi'};
}
push @data, \@row;
}
# prepare rounds' average scores
my @row = ();
for (sort keys %{ $params->{'rounds'} }) {
push @row, $params->{'rounds'}->{$_};
}
push @legend, "average";
push @types, "lines";
push @data, \@row;
# labels for rounds; must be first @data element
my @rounds;
for (my $i = 1; $i <= scalar @row; $i++) {
push @rounds, $i;
}
unshift @data, \@rounds;
return {
"legend" => \@legend,
"data" => \@data,
"types" => \@types,
};
}
# --
# --
# -- score_average
# -- return average score for a given field in a list of score hash-refs
sub score_average
{
my ($scores, $field) = @_;
my $avg = 0.0;
if (scalar @{ $scores }) {
my $sum = 0;
$sum += $_ for map { $_->{$field} } @{ $scores };
$avg = $sum / (scalar @{ $scores });
}
return $avg;
}
1;
__END__
scan_parse.pm
package scan_parse;
# --
# -- package scan_parse
# -- CRM scan results parsers and formatters
# --
# -- 2005-04-04, zak.burke@dartmouth.edu
# --
# -- $Author: zburke $
# -- $Revision: 1.5 $
# -- $Date: 2005/07/12 03:22:11 $
# --
use strict;
# --
# --
# -- parser
# -- return the parser whose head method matches the given line
sub parser
{
my ($line) = @_;
my $parsers = formats();
for (keys %{ $parsers }) {
my $head = &{ ${ $parsers }{$_} }("head");
return ${ $parsers }{$_} if $line =~ /^$head$/;
}
print STDERR "parser $line is not available; returning default parser\n";
return ${ $parsers }{'format_default'};
}
# --
# --
# -- parser_by_name
# -- select a parser by name. the default is returned if the requested
# -- parser is not available.
sub parser_by_name
{
my ($name) = @_;
my $parsers = formats();
return ${ $parsers }{'format_default'} unless $name;
return ${ $parsers }{$name} ? ${ $parsers }{$name} : ${ $parsers }{'format_default'};
}
# --
# --
# -- formats
# -- return available parsers with names as keys and function-refs
# -- as values
sub formats
{
my %list = (
'format_default' => \&format_default,
'format2' => \&format2,
'format3' => \&format3,
'format4' => \&format4,
'format5' => \&format5,
);
return \%list;
}
# --
# --
# -- format_default
# -- default parser
sub format_default
{
my ($what, $item) = @_;
# FIELD NAMES
# hm: harmonic mean score
# phi: phi score
# recall: true-positives / (false-negatives + true-positives)
# precision: true-positives / (false-positives + true-positives)
# file: path to file these stats are derived from
my @fields = qw(hm phi_score recall precision file);
if ($what eq "head") {
return join "\t", @fields;
} elsif ($what eq "read") {
return line_read($item, \@fields);
} elsif ($what eq "write") {
return line_write($item, \@fields);
}
}
# --
# --
# -- format2
# -- formatter with CRM metadata
sub format2
{
my ($what, $item) = @_;
my @fields = qw(hm phi_score recall precision s_to_n file);
if ($what eq "head") {
return join "\t", @fields;
} elsif ($what eq "read") {
return line_read($item, \@fields);
} elsif ($what eq "write") {
return line_write($item, \@fields);
}
}
# --
# --
# -- format3
# -- formatter with CRM metadata and detail scan params
sub format3
{
my ($what, $item) = @_;
my @fields = qw(hm phi_score recall precision s_to_n profile-cutoff peak-width input-smooth supression file);
if ($what eq "head") {
return join "\t", @fields;
} elsif ($what eq "read") {
return line_read($item, \@fields);
} elsif ($what eq "write") {
return line_write($item, \@fields);
}
}
# --
# --
# -- format4
# -- formatter with CRM metadata, train params and scan params
sub format4
{
my ($what, $item) = @_;
my @fields = qw(hm phi_score recall precision s_to_n word window channel profile-cutoff peak-width input-smooth supression file);
if ($what eq "head") {
return join "\t", @fields;
} elsif ($what eq "read") {
return line_read($item, \@fields);
} elsif ($what eq "write") {
return line_write($item, \@fields);
}
}
# --
# --
# -- format5
# -- formatter with CRM metadata, train params and scan params
sub format5
{
my ($what, $item) = @_;
my @fields = qw(hm phi_score recall precision specificity s_to_n word window channel profile-cutoff peak-width input-smooth supression file);
if ($what eq "head") {
return join "\t", @fields;
} elsif ($what eq "read") {
return line_read($item, \@fields);
} elsif ($what eq "write") {
return line_write($item, \@fields);
}
}
# --
# --
# -- line_read
# -- split on tabs, then push fields into hash with keys from
# -- @$fields array. return a hash-ref.
# --
# -- ARGUMENTS
# -- $item -- tab-delimited data
# -- $fields -- array-ref of field-names
sub line_read
{
my ($item, $fields) = @_;
my %list;
my @cols = split "\t", $item;
for (my $i = 0; $i < (scalar @{ $fields} ); $i++) {
$list{${ $fields }[$i]} = $cols[$i];
}
return \%list;
}
# --
# --
# -- line_write
# -- format %$items values into a tab-delimited line of fields from
# -- @$fields
# --
# -- ARGUMENTS
# -- $item - hash ref of data
# -- $fields - ordered list of fields to pluck from $item
sub line_write
{
my ($item, $fields) = @_;
return "" . (join "\t", map { $item->{$_} } @{ $fields }) . "\n";
}
# --
# --
# -- field_parser
# -- return the requested file field-parser
sub field_parser
{
my ($name) = @_;
return $name ? ${ field_parsers() }{$name} : ${ field_parsers() }{'default'}
}
# --
# --
# -- field_parsers
# -- return a hash of line parsers. keys are parser names; values are
# -- function refs.
sub field_parsers
{
my %list;
$list{'ts'} = \&field_parse_ts;
$list{'gene'} = \&field_parse_gene;
$list{'default'} = \&field_parse_gene;
return \%list;
}
# --
# --
# -- field_parse
# -- parse line into a list of fields
# --
# -- ARGUMENTS
# -- $line - line of data to read
# -- $line_parser - function-ref; maps line values to key-value pairs
# -- $file_field_parser - function-ref; extracts values encoded in
# -- line's file field
# --
sub field_parse
{
my ($line, $line_parser, $file_field_parser) = @_;
# parse line into fields
my $list = &{ $line_parser }("read", $line);
# extract fields from the file name ...
my $fields = &{ $file_field_parser }(${ $list }{'file'});
# ... and add them to the fields to be returned.
# note that some filenames include fields and some omit them.
# in the case that the filename OMITS the field, leave the value
# extracted from the line-parsing in place, rather than replacing it
# with the (possibly empty) value extracted from the filename.
for (keys %{ $fields }) {
${ $list }{$_} = ${ $fields }{$_} unless ${ $list }{$_};
${ $list }{$_} = ${ $fields }{$_} if ${ $fields }{$_};
}
return $list;
}
# --
# --
# -- field_parse_ts
# -- split a training-set filename into its constituent parts
# --
# -- ARGUMENTS
# -- $file - filename with embedded metadata
sub field_parse_ts
{
my ($file) = @_;
# fields embedded in filename are:
# abbv: abbreviated gene-name
# word: word-length
# window: window-length
# channel: number of channels
# ts: training set name
# extract fields from the filename; throw out leading directories
my @file = split "\/", $file;
$file = $file[$#file];
my ($abbv, $word, $mismatch, $window, $channel, $ts) = ($file =~ /([^\/]+)\.fa_([0-9]+)_([0-9]+)_([0-9]+)_([0-9]+)_(train_[0-9]+).*/);
return {
'abbv' => $abbv,
'word' => $word,
'mismatch' => $mismatch,
'window' => $window,
'channel' => $channel,
'ts' => $ts,
};
}
# --
# --
# -- field_parse_gene
# -- split a training-set filename into its constituent parts
# --
# -- ARGUMENTS
# -- $file - filename with embedded metadata
sub field_parse_gene
{
my ($file) = @_;
# fields embedded in filename are:
# abbv: abbreviated gene-name
# extract fields from the filename; throw out leading directories
my @file = split "\/", $file;
$file = $file[$#file];
my ($abbv) = ($file =~ /([^\/]+)\.fa_scan.*/);
return {
'abbv' => $abbv,
};
}
# --
# --
# -- pcrm_file_read
# -- read pCRM position information from the FASTA file in $dir_name.
# -- if the file is missing or contains no data, return nothing.
# -- otherwise return an array of hash-refs with position details.
# --
# -- ARGUMENTS
# -- $dir_name - path to directory with LWF scan result files
# --
sub pcrm_file_read
{
my ($dir_name) = @_;
# find the file with the pCRM substrings
opendir SCANDIR, $dir_name;
my @scan = grep { /.*\.fa\.xtr\.dat$/ } readdir SCANDIR;
closedir SCANDIR;
if (! $scan[0]) {
print STDERR "no .fa.xtr.dat file in $dir_name\n";
return;
}
# read the pCRM substrings
open SCAN, "<$dir_name/$scan[0]";
my @lines = ;
close SCAN;
if (! scalar @lines) {
print STDERR "$scan[0] has no content\n";
return ;
}
# get pCRM positions
my @test;
for (@lines) {
# trim trailing whitespace, including damn windows line breaks
chomp;
s/\s+$//g;
my @words = split / /;
my ($beg, $end) = split "-", $words[1];
push @test, {'beg' => $beg, 'end' => $end};
}
return \@test;
}
1;
scope.pm
package scope;
# --
# -- routines helpful for calling SCOPE and parsing its output
# --
# --
# -- $Author: zburke $
# -- $Revision: 1.4 $
# -- $Date: 2005/10/20 05:19:56 $
# --
use strict;
#use constant CLASS_LIB => "/home/zburke/bg/workspace/scopegenie/lib";
use constant CLASS_LIB => "/home/zburke/bg/workspace/scope/lib";
use constant CLASS_PATH => CLASS_LIB."/cos.jar:".CLASS_LIB."/acme.jar:".CLASS_LIB."/scope.jar";
# only motifs with sig scores >= SIG_THRESHOLD will be used for anything
use constant SIG_THRESHOLD => 3.0;
sub new
{
my $class = shift;
my $self = {};
bless($self, $class);
my %input = @_;
return $self;
}
# --
# --
# -- dir_out
# -- get/set the output file to read. the setter doesn't tell SCOPE where
# -- to write its output, rather it tells this module where to find the
# -- output from its run of SCOPE.
sub dir_out
{
my $self = shift;
$self->{'dir_out'} = shift if @_;
return $self->{'dir_out'};
}
# --
# --
# -- parse
# -- parse the output file and cache the results
sub parse
{
my $self = shift;
my $max_score = 0;
open FILE, "<", $self->dir_out() or die("couldn't open $self->{'dir_out'}: $!");
while () {
chomp;
next unless $_;
my ($sig, $word) = split /\s+/, $_;
last if $sig < SIG_THRESHOLD;
# found a new maximum score
$max_score = $sig if $sig > $max_score;
my ($motif, $side) = split ",", $word;
push @{ $self->{results} }, {
'motif' => $motif,
'sig_score' => $sig,
'side' => $side,
};
}
close FILE;
# normalize sig scores into the range 0-1
$self->sig_normalize($max_score);
}
# --
# --
# -- sig_normalize
# -- normalize sig scores into the range 0-1
sub sig_normalize
{
my $self = shift;
my ($max) = @_;
for (@{ $self->{'results'} }) {
$_->{'sig_score'} = $_->{'sig_score'} / $max;
}
}
# --
# -- field
# -- return output values for $field
sub field
{
my $self = shift;
my ($field) = @_;
$self->parse() unless $self->{'results'};
my (@list);
for (@{ $self->{'results'} }) {
push @list, $_->{$field};
}
return \@list;
}
# --
# --
# -- motifs
# -- return motifs SCOPE found
sub motifs
{
my $self = shift;
return $self->field("motif");
}
# --
# --
# -- scores
# -- return sig-scores SCOPE found
sub scores
{
my $self = shift;
return $self->field("sig_score");
}
# --
# --
# -- results
# -- return sig-scores, motifs and side on which SCOPE found
# -- each motif
sub results
{
my $self = shift;
$self->parse() unless $self->{'results'};
return $self->{'results'};
}
# --
# --
# -- run
# -- run SCOPE
sub run
{
my $self = shift;
my ($filename) = @_;
$self->{'results'} = undef;
die("couldn't find $filename") unless -f $filename;
my $flags = join " ", @{ $self->run_flags($filename) };
system("java $flags");
}
# --
# --
# -- run_flags
sub run_flags
{
my $self = shift;
my ($filename) = @_;
my (@flags);
push @flags, "-classpath ".CLASS_PATH;
push @flags, "-mx1000M";
# push @flags, "-Dscope.root=/home/zburke/bg/workspace/scopegenie/";
# push @flags, "-Djava.path=/usr/bin/java";
push @flags, "edu.dartmouth.bglab.beam.Scope";
push @flags, "-pf D_melanogaster_enhancers.param";
push @flags, "-sgs fastaFile";
push @flags, "-sgi $filename";
return \@flags;
}
# --
# --
# -- motif_line_format
# -- ordered list of fields for printing motif information
# -- NOTE: THIS IS A STATIC METHOD
sub motif_line_format
{
my @fields = qw(motif sig_score side);
return \@fields;
}
return 1;
__END__
util.pm
package util;
# --
# -- utility methods
# --
# -- $Author: zburke $
# -- $Revision: 1.2 $
# -- $Date: 2005/09/22 03:32:32 $
# --
use strict;
use constant TRUE => 1;
use constant FALSE => 0;
use constant FASTA_LINE_LENGTH => 60;
# --
# --
# -- recall
# -- given the count of true-positives and false-negatives, calculate
# -- and return the recall rate.
# --
# -- recall is calculated as follows:
# --
# -- R == (true-positives) / (false-negatives + true positives)
# --
sub recall
{
my ($tp, $fn) = @_;
my $sum = $fn + $tp;
return $sum ? ($tp / $sum) : 0;
}
# --
# --
# -- precision
# -- given the count of true-positives and false-positives, calculate
# -- and return the precision rate.
# --
# -- precision is calculated as follows:
# --
# -- P == (true-positives) / (false positives + true positives)
# --
sub precision
{
my ($tp, $fp) = @_;
return ($tp / ($fp + $tp));
}
# --
# --
# -- harmonic_mean
# -- return the harmonic mean of the 2 given rates. The harmonic mean is useful
# -- for calculating an average of rates.
# --
# -- the harmonic mean is calculated as follows:
# --
# -- H == n / ( (1/a[1]) + (1/a[2]) + ... + (1/a[n]) ).
# --
# -- for a simple two-variable equation, this reduces to:
# --
# -- H == (2ab) / (a + b)
# --
sub harmonic_mean($$)
{
my ($recall, $precision) = @_;
return 0 if ($precision + $recall) == 0;
return ((2 * $precision * $recall) / ($precision + $recall));
}
# --
# --
# -- phi_score
# -- calculate a phi_score: true-positives / [false-pos + false-neg]
# --
# -- the phi-score has become a standard metric for describing the
# -- performance of motif finding programs. it was first described in
# -- Pevzner, P.A. and Sze, S.H. (2000) Combinatorial approaches to finding
# -- subtle signals in DNA sequences. Proceedings of the International
# -- Conference on Intelligent Systems in Molecular Biology 8:269-78.
sub phi_score
{
my ($tp, $fp, $fn) = @_;
return ($tp / ($fp + $fn));
}
# --
# --
# -- in_crm
# -- return TRUE if $i is within a defined CRM region in @$list;
# -- return FALSE otherwise.
sub in_crm
{
my ($list, $i) = @_;
for (@{ $list }) {
return TRUE if ($i >= $_->{'beg'} && $i <= $_->{'end'})
}
return FALSE;
}
# --
# --
# -- fpe
# -- floating-point-equality test hack
sub fpe
{
my ($X, $Y, $POINTS) = @_;
$POINTS = 10 unless $POINTS;
my ($tX, $tY);
$tX = sprintf("%.${POINTS}g", $X);
$tY = sprintf("%.${POINTS}g", $Y);
return $tX eq $tY;
}
# --
# --
# -- fple
# -- floating-point-less-than-or-equal test hack
sub fple
{
my ($X, $Y, $POINTS) = @_;
return util::TRUE if ($X < $Y);
return fpe($X, $Y, $POINTS);
}
# --
# --
# -- elapsed_time
# -- return the given time in seconds formatted as hh:mm:ss
sub elapsed_time
{
my ($timer) = @_;
my ($hour, $min, $sec) = (0, 0, 0);
$hour = int($timer / (60 * 60) );
$min = int (($timer / 60) % 60);
$sec = $timer % 60;
return sprintf("\nruntime: %2s:%2s:%2s\n\n",
($hour > 9 ? $hour : "0$hour"),
($min > 9 ? $min : "0$min"),
($sec > 9 ? $sec : "0$sec"));
}
# --
# --
# -- fasta
# -- return a fasta formatted sequence
sub fasta
{
my ($sequence, $comment) = @_;
my $content;
$content .= ">$comment\n" if $comment;
for (my $i = 0; $i < length($sequence); $i+= FASTA_LINE_LENGTH) {
$content .= substr($sequence, $i, FASTA_LINE_LENGTH). "\n";
last if (substr($sequence, $i, FASTA_LINE_LENGTH) eq "");
}
$content .= "\n";
return $content;
}
1;
Appendix 5 Genbank Parser Application Code
Java Files
BglabCommandLine.java
package edu.dartmouth.bglab.pipeline.util;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.MissingArgumentException;
/**
*
* @author $Author: zburke $
* @version $Revision: 1.2 $
* update $Date: 2005/09/22 03:21:34 $
*
*/
public class BglabCommandLine
{
private CommandLine cl = null;
public BglabCommandLine(CommandLine c)
{
this.cl = c;
}
/**
* Return true if the requested option has been set
*
* @param opt short name of option to retrieve
* @return true if the option has been set; false otherwise.
*/
public boolean hasOption(String opt)
{
return this.cl.hasOption(opt);
}
/**
* Retrieve an Option's value.
*
* @param opt short name of option to retrieve
* @return string value of requested option
* @throws MissingArgumentException if the requested Option is undefined
*/
public String value(String opt)
throws MissingArgumentException
{
String value = null;
if (this.cl.hasOption(opt))
value = this.cl.getOptionValue(opt);
if (value == null)
throw new RuntimeException("The option " + opt + " has no value.");
return value;
}
/**
* Retrieve the int value of an option.
*
* @param opt short name of option to retrieve
* @return the integer value of opt
* @throws MissingArgumentException if the requested Option is undefined
*/
public int intValue(String opt)
throws MissingArgumentException
{
String value = null;
if (this.cl.hasOption(opt))
value = this.cl.getOptionValue(opt);
if (value == null)
throw new RuntimeException("The option " + opt + " has no value.");
return Integer.parseInt(value);
}
/**
* Return true if an Option is defined, false otherwise.
*
* @param opt short name of option to retrieve
* @return boolean value of the Option
*/
public boolean booleanValue(String opt)
{
return (this.cl.hasOption(opt) ? true : false);
}
}
Genbank.java
package edu.dartmouth.bglab.pipeline.util;
import org.biojava.bio.BioException;
import org.biojava.bio.seq.impl.RevCompSequence;
import org.biojava.bio.seq.io.*;
import org.biojava.bio.seq.*;
import org.biojava.bio.symbol.*;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.PrintStream;
import java.io.FileNotFoundException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
/**
* Genbank file utility methods.
*
* A sample file is available here:
* http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html
*
* 2005-08-25, zak.burke@dartmouth.edu
*
*/
public class Genbank
{
/** display distinct features in each sequence */
public boolean logFeatures = false;
/** bucket for distinct feature labels */
private HashSet features = null;
/** display gene locations in each sequence */
public boolean logLocations = false;
/** length of a line in a FASTA-formatted sequence */
public int fastaLineLength = 60;
/** show gene features only */
public boolean filterGenic = false;
/** show non-gene features only */
public boolean filterBg = false;
/** show gene and non-gene features */
public boolean filterAll = false;
/** show features upstream of genes */
public boolean filterUpstream = false;
/** default upstream -sequence length to extract */
public int filterUpstreamLength = 1500;
/** remove {@link #FEATURE_REPEAT} regions from intergenic regions */
public boolean removeRepeats = false;
/** Genbank file region label */
public static String FEATURE_GENE = "gene";
/** Genbank file region label */
public static String FEATURE_REPEAT = "repeat_region";
public static int STRAND_POS = 1;
public static int STRAND_NEG = -1;
/**
* default constructor does nothing at all.
*
*/
public Genbank()
{}
/**
* Converts the Genbank file named by filename into FASTA and prints
* it to ps.
*
* @param filename Genbank-formatted file to read from
* @param ps printstream to write to
* @throws Exception
*/
public void toFasta(String filename, PrintStream ps)
throws FileNotFoundException, BioException
{
BufferedReader br = new BufferedReader(new FileReader(filename));
// parse the Genbank file. damn that's easy.
System.err.println("reading "+filename);
SequenceIterator stream = SeqIOTools.readGenbank(br);
System.err.println("parsing "+filename);
while (stream.hasNext())
toFasta(stream.nextSequence(), ps);
}
/**
* Converts a {@link Sequence} into FASTA and prints it to ps.
*
* @param filename Genbank-formatted file to read from
* @param ps printstream to write to
*/
public void toFasta(Sequence seq, PrintStream ps)
throws IllegalAlphabetException
{
// log features in this sequences
if (this.logFeatures)
featureRecord(seq);
List genes = filter(seq, Genbank.FEATURE_GENE);
List repeats = new ArrayList();
// TODO: implement repeat removal
//if (this.removeRepeats)
// repeats = filter(seq, Genbank.FEATURE_REPEAT);
if (this.filterGenic)
displayGenic(seq, genes, ps);
else if (this.filterAll)
displayAll(seq, genes, repeats, ps);
else if (this.filterBg)
displayIntergenic(seq, genes, repeats, ps);
else if (this.filterUpstream)
displayUpstream(seq, genes, ps);
if (this.logLocations)
locationsDisplay(genes, System.err);
}
/**
* Filter certain features from the given {@link Sequence} and return
* them in an ordered list.
*
* @param seq sequence containing {@link Feature}s
* @param filter name of features to extract from seq
* @return Features sorted by location in seq
*/
public List filter(Sequence seq, String filter)
{
ArrayList list = new ArrayList();
// make a Filter for "gene" features and retrieve them
// features from D. Melanogaster CHR_2 include:
// CDS gene mRNA misc_RNA repeat_region rRNA snoRNA source tRNA
FeatureFilter ff = new FeatureFilter.ByType(filter);
FeatureHolder fh = seq.filter(ff);
// store all Features in fh so we can sort & print 'em later
for (Iterator i = fh.features(); i.hasNext(); )
list.add(i.next());
// sort the list to make it easier to parse out non-gene pieces
System.err.println("sorting " + list.size() + " " + filter + " features...");
Collections.sort(list, featureLocationComparator());
return list;
}
/**
* Extract genic sequences of this sequence. Gene features can occur
* on either strand although their positions are always recorded
* based on their offset from the 5' end of the + strand. Genes on the -
* strand are reported as the reverse complement of their reported
* positions, that is, as the bases on the - strand in the 5' to 3'
* direction.
*
* @param seq sequence containing list of gene Features
* @param list gene Features in this sequence
* @param ps stream to print to
*/
public void displayGenic(Sequence seq, List list, PrintStream ps)
{
StrandedFeature f = null; // currently iterated feature
String letters = null; // subsequence of current feature
for (Iterator i = list.iterator(); i.hasNext();)
{
f = (StrandedFeature) i.next();
letters = f.getSymbols().seqString();
ps.println(">"+f.getAnnotation().getProperty("gene"));
string2fasta(letters, ps);
}
}
/**
* Extract intergenic sequences of this sequence. Gene features can occur
* on either strand, but this algorithm treats all features as though
* they occur on the same (+) strand and only extracts the intergenic
* regions from this strand.
*
* @param seq sequence containing list of gene Features
* @param list gene Features in this sequence
* @param repeats repeat_region Features in this sequence
* @param ps stream to print to
*/
public void displayIntergenic(Sequence seq, List list, List repeats, PrintStream ps)
{
// intial location has NOTHING -- bio sequences use 1-based counting
Location previous = LocationTools.makeLocation(0, 0);
Location current = null;
// temp variable store offsets of current intergenic region
int beg = 0, end = 0;
for (Iterator i = list.iterator(); i.hasNext();)
{
current = ((Feature) i.next()).getLocation();
// start at 1 past last sub-sequence; end at 1 before current one
beg = previous.getMax() + 1;
end = current.getMin() - 1;
displayIntergenicHelper(beg, end, seq, ps, repeats);
// reset previous if this sequence ends after the previous one
if (current.getMax() > previous.getMax())
previous = current;
}
}
/**
* Extract genic and intergenic sequences of this sequence and display
* them. Gene features can occur
* on either strand although their positions are always recorded
* based on their offset from the 5' end of the + strand. Genes on the -
* strand are reported as the reverse complement of their reported
* positions, that is, as the bases on the - strand in the 5' to 3'
* direction.
*
* @param seq sequence containing list of gene Features
* @param list gene Features in this sequence
* @param repeats repeat_region Features in this sequence
* @param ps stream to print to
*/
public void displayAll(Sequence seq, List list, List repeats, PrintStream ps)
{
// intial location has NOTHING -- bio sequences use 1-based counting
Location previous = LocationTools.makeLocation(0, 0);
Location current = null;
// temp variable store offsets of current intergenic region
int beg = 0, end = 0;
Feature f = null;
for (Iterator i = list.iterator(); i.hasNext();)
{
f = (Feature) i.next();
current = f.getLocation();
// start at 1 past last sub-sequence; end at 1 before current one
beg = previous.getMax() + 1;
end = current.getMin() - 1;
// intergenic sequence
displayIntergenicHelper(beg, end, seq, ps, repeats);
// genic sequence
ps.println(">"+f.getAnnotation().getProperty("gene"));
string2fasta(f.getSymbols().seqString(), ps);
// reset previous if this sequence ends after the previous one
if (current.getMax() > previous.getMax())
previous = current;
}
}
/**
* format the region between beg and end (inclusive) as FASTA.
*
* @param beg first intergenic position
* @param end last intergenic position
* @param seq sequence containing this span
* @param ps stream to write to
* @param repeats list of {@link Feature}s labeled as repeat_region
*/
private void displayIntergenicHelper(int beg, int end, Sequence seq, PrintStream ps, List repeats)
{
/*
if (this.removeRepeats)
{
Location l = LocationTools.makeLocation(beg, end);
List list = recordRepeats(l, repeats);
Location repeat = null;
for (Iterator i = list.iterator(); i.hasNext();)
{
repeat = (Location) i.next();
// handle: rrrrrrr
// llllll
if (l.getMin() < repeat.getMin())
// handle: rrrrrrr
// llllll
if (l.getMax() > repeat.getMax())
}
}
*/
if (beg < end)
{
ps.println(">intergenic-" + beg + "-" + end);
// note: seq.subStr is 1-based and spans beg-end INCLUSIVE
string2fasta(seq.subStr(beg, end), ps);
}
else
{
System.err.println("intergenic overlap from "+beg+" to "+end);
}
}
/**
* Find Locations on a list that intersect the given location.
*
* @param l location which may have overlaps
* @param repeats ordered list of locations of repeat regions
*
* @return repeat locations overlapping l
*/
private List recordRepeats(Location l, List repeats)
{
ArrayList list = new ArrayList();
Location repeat = null;
for (Iterator i = repeats.iterator(); i.hasNext();)
{
repeat = (Location) i.next();
if (l.overlaps(repeat))
list.add(repeat);
if (repeat.getMin() > l.getMin())
break;
}
return list;
}
/**
* Extract regions upstream of the genic features in this sequence.
* Extractions are
* Gene features can occur
* on either strand although their positions are always recorded
* based on their offset from the 5' end of the + strand. Genes on the -
* strand are reported as the reverse complement of their reported
* positions, that is, as the bases on the - strand in the 5' to 3'
* direction.
*
* @param seq sequence containing list of gene Features
* @param list gene Features in this sequence
* @param ps stream to print to
*/
public void displayUpstream(Sequence seq, List list, PrintStream ps)
throws IllegalAlphabetException
{
RevCompSequence rcSeq = new RevCompSequence(seq);
// note the funny math here:
// because bio-string positions are 1-based but String.length() is
// 0-based, extracting sequences in rcSeq based on their offsets in
// seq is easy. given begin and end offsets on the + strand, the
// corresponding offsets on the - strand are (length - end) and
// (length - beg) for the beginning and end, respectively.
int seqLen = rcSeq.seqString().length();
StrandedFeature f = null; // currently iterated feature
String letters = null; // subsequence of current feature
for (Iterator i = list.iterator(); i.hasNext();)
{
f = (StrandedFeature) i.next();
int beg = f.getLocation().getMin() - this.filterUpstreamLength;
if (beg < 1)
beg = 1;
int end = f.getLocation().getMin() - 1;
if (f.getStrand().getValue() == Genbank.STRAND_POS)
letters = seq.subStr(beg, end);
else if (f.getStrand().getValue() == Genbank.STRAND_NEG)
letters = seq.subStr(seqLen - end, seqLen - beg);
ps.println(">"+f.getAnnotation().getProperty("gene"));
string2fasta(letters, ps);
}
}
/**
* Display location of feature on list.
*
* @param list gene Features in this sequence
* @param ps stream to print to
*/
public void locationsDisplay(List list, PrintStream ps)
{
// currently iterated feature
Feature f = null;
for (Iterator i = list.iterator(); i.hasNext();)
{
f = (Feature) i.next();
ps.println(">"+f.getAnnotation().getProperty("gene") + " " + f.getLocation());
}
}
/**
* Compare Location minimums.
*
* @return -1 if a is < b; 0 if a == b; 1 if a > b.
*/
private Comparator locationComparator()
{
return new Comparator() {
public int compare(Object o1, Object o2)
{
int l1 = ((Location) o1).getMin();
int l2 = ((Location) o2).getMin();
if (l1 > l2)
return 1;
else if (l1 == l2)
return 0;
else
return -1;
}
};
}
/**
* Compare Features' Location minimums.
*
* @return -1 if a is < b; 0 if a == b; 1 if a > b.
*/
private Comparator featureLocationComparator()
{
return new Comparator() {
public int compare(Object o1, Object o2)
{
Comparator c = locationComparator();
return c.compare(
((Feature) o1).getLocation(),
((Feature) o2).getLocation()
);
}
};
}
/**
* Pull feature types from this sequence so we can find out what types
* we can use as filters.
*
* @param s a Sequence to filter
*/
private void featureRecord(Sequence s)
{
if (this.features == null)
this.features = new HashSet();
for (Iterator i = s.features(); i.hasNext(); )
{
Feature f = (Feature) i.next();
this.features.add(f.getType());
}
}
/**
* FASTA format a string.
*
* @param s string to format
* @param ps stream to write to
*/
public void string2fasta(String s, PrintStream ps)
{
// entire sequence on one line
if (this.fastaLineLength < 1)
{
ps.println(s);
return;
}
int max = s.length();
for (int j = 0; j < max; j += this.fastaLineLength)
{
int eol = j + this.fastaLineLength; // end of current line
if (eol > max) eol = max; // end of last line
ps.println(s.substring(j, eol)); // print it
}
}
/**
* Display features which were captured with {@link #featureRecord}
*
* @param ps stream to write to
*/
public void featuresDisplay(PrintStream ps)
{
if (this.features == null)
return;
for (Iterator i = this.features.iterator(); i.hasNext();)
ps.println((String) i.next());
}
}
GenbankDriver.java
package edu.dartmouth.bglab.pipeline.util;
import org.apache.commons.cli.*;
/**
* Genbank file utility methods.
*
* A sample file is available here:
* http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html
*
* 2005-08-25, zak.burke@dartmouth.edu
*
*/
public class GenbankDriver
{
/** */
public BglabCommandLine cli = null;
/** command line options: input filename */
public static String INPUT = "i";
/** command line options: return all sequence substrings */
public static String FILTER_ALL = "p";
/** command line options: return only substrings of genes */
public static String FILTER_GENIC = "q";
/** command line options: return only non-gene substrings */
public static String FILTER_BG = "r";
/** command line options: return only non-gene substrings */
public static String FILTER_UP = "u";
/** command line options: length of sequence to extract upstream of gene features */
public static String UPSTREAM_LENGTH = "v";
/** command line options: skip repeats in FILTER_ALL and FILTER_BG */
public static String SKIP = "s";
/** command line options: reset fasta line length */
public static String LINE_LENGTH = "z";
/** command line options: display distinct features from input sequence */
public static String FEATURES = "f";
/** command line options: display genic locations from input sequence */
public static String LOCATIONS = "l";
/** command line options: show help */
public static String HELP = "h";
public static void main(String[] args)
{
GenbankDriver gd = new GenbankDriver();
Genbank g = new Genbank();
// available command line options
Options options = gd.optionsInit();
try
{
gd.init(args, options, g);
// input sequence filename
String file = gd.cli.value(GenbankDriver.INPUT);
g.toFasta(file, System.out);
// show features if requested
if (gd.cli.booleanValue(GenbankDriver.FEATURES))
g.featuresDisplay(System.err);
}
// command line parse error; show help and EXIT.
catch (ParseException e)
{
gd.usage(options, "ERROR: " + e.getMessage() + "\n", 1);
}
// file parsing error
catch (Exception e)
{
System.err.println(e.getMessage());
e.printStackTrace();
}
}
/**
* default constructor does nothing at all.
*
*/
public GenbankDriver()
{}
/**
* Parse command line options.
*
* @param args command line argument array
* @param options available options
*
* @return options as parse from the command line
*/
private void init(String[] args, Options options, Genbank g)
throws ParseException
{
// options as parsed from the command line
// throws error on missing option and missing option value
CommandLine cmd = new PosixParser().parse(options, args);
// ERROR: we can never get to this code if one of the required options
// is missing, which means that calling this application with just "-h"
// will always exit with a non-zero error code. I can't figure out
// how to short-circuit the parsing algorithm to check for help first,
// though it seems like there ought to be a way to do that.
// user needs help; show it and exit.
if (cmd.hasOption("h"))
usage(options, null, 0);
this.cli = new BglabCommandLine(cmd);
// not set options on t
try
{
if (this.cli.booleanValue(GenbankDriver.LINE_LENGTH))
g.fastaLineLength = this.cli.intValue(GenbankDriver.LINE_LENGTH);
}
catch (Exception e)
{
System.err.println("Couldn't parse " + GenbankDriver.LINE_LENGTH + " value; using default.");
}
// record distinct features
if (this.cli.booleanValue(GenbankDriver.FEATURES))
g.logFeatures = true;
// record distinct features
if (this.cli.booleanValue(GenbankDriver.LOCATIONS))
g.logLocations = true;
// select features to display
if (this.cli.booleanValue(GenbankDriver.FILTER_GENIC))
g.filterGenic = true;
else if (this.cli.booleanValue(GenbankDriver.FILTER_ALL))
g.filterAll = true;
else if (this.cli.booleanValue(GenbankDriver.FILTER_BG))
g.filterBg = true;
else if (this.cli.booleanValue(GenbankDriver.FILTER_UP))
g.filterUpstream = true;
// if FILTER_UP, check
if (this.cli.booleanValue(GenbankDriver.FILTER_UP))
if (this.cli.hasOption(GenbankDriver.UPSTREAM_LENGTH))
g.filterUpstreamLength = this.cli.intValue(GenbankDriver.UPSTREAM_LENGTH);
}
/**
* Explain how to use this program, then exit.
*
* @param options list of command line options
* @param message error message, if any
* @param exitCode error code in case we got here from an Exception
*/
protected void usage(Options options, String message, int exitCode)
{
if (message != null)
System.err.println(message);
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp( "Genbank", options );
System.exit(exitCode);
}
/**
* Configure options which can be passed in on the command line.
*
* @return list of options
*/
protected Options optionsInit()
{
Option o = null;
Options options = new Options();
// input: name of genbank file to read
o = new Option(GenbankDriver.INPUT, "input", true, "name of file to read");
o.setArgName("gbk-file");
o.setRequired(true);
options.addOption(o);
// filter what features to display
OptionGroup g = new OptionGroup();
g.setRequired(true);
g.addOption(new Option(GenbankDriver.FILTER_ALL, "filter-all", false, "return all sequence subsets"));
g.addOption(new Option(GenbankDriver.FILTER_GENIC, "filter-genic", false, "return gene features only"));
g.addOption(new Option(GenbankDriver.FILTER_BG, "filter-bg", false, "return non-gene features only"));
g.addOption(new Option(GenbankDriver.FILTER_UP, "filter-up", false, "return sequences upsream of gene features only"));
options.addOptionGroup(g);
// locations: whether to display genic locations in input sequence
options.addOption(GenbankDriver.LOCATIONS, "locations", false, "display genic locations in input sequence");
// output line length
o = new Option(GenbankDriver.LINE_LENGTH, "line-length", true, "line length of output; 0 for single line");
o.setArgName("line-length");
options.addOption(o);
// output line length
o = new Option(GenbankDriver.UPSTREAM_LENGTH, "upstream-length", true, "length of sequence to extract upstream of gene features");
o.setArgName("line-length");
options.addOption(o);
// features: whether to display the distinct features in input sequence
options.addOption(GenbankDriver.FEATURES, "features", false, "display distinct features in gbk file");
// locations: whether to display genic locations in input sequence
options.addOption(GenbankDriver.LOCATIONS, "locations", false, "display genic locations in input sequence");
// help: whether to display options and quit
options.addOption(GenbankDriver.HELP, "help", false, "display these instructions");
return options;
}
}