1. Take the next_seq and write_seq subroutines that you wrote in the previous lab, and place them into a Package called IGSPClass::SeqPackage. You should modify write_seq to take a variable which holds the output handle as an argument, and write to that instead of the default.
Answer:
package IGSPClass::SeqPackage;
use Exporter;
use vars qw/@ISA @EXPORT/;
@ISA = qw/Exporter/;
@EXPORT = qw/next_seq write_seq/;
sub next_seq {
my $handle = shift;
# this will hold our current record
my $curr_record; # this must start undef, so that it returns undef on EOF
# this is a state variable that is set to true after the SQ line is encountered
# which tells the line processecor to process sequence entry lines, since there
# is no identifiable pattern that can be matched for them
my $inseq;
LINE: while (my $line = <$handle>) {
chomp $line;
next if ($line eq 'XX'); # these are spacer lines
if ($line =~ m/^ID/) {
# harvest the id
my $id = (split /\s+/, $line)[1];
$id =~ s/\;$//;
$curr_record->{id} = $id;
}
elsif ($line =~ m/^AC/) {
# harvest the accession
my $acc = (split /\s+/, $line)[1];
$acc =~ s/\;$//;
$curr_record->{accession} = $acc;
}
elsif ($line =~ m/^RL\s+(.*)/) {
# harvest the RL line
my $rl = $1;
# note, we could push this
# onto an arrayref keyed to rl, as well
$curr_record->{rl} .= ' '.$rl; # prepend it with a space
}
elsif ($line =~ m/^FT/) {
# there are several of these that we are interested in
if ($line =~ m/\/mol_type\=(.*)/) {
# harvest the mol_type
my $mol_type = $1;
$mol_type =~ s/\"//g;
$curr_record->{mol_type} = $mol_type;
}
elsif ($line =~ m/\/chromosome\=(.*)/) {
# harvest the chromosome
my $chromosome = $1;
$chromosome =~ s/\"//g;
$curr_record->{chromosome} = $chromosome;
}
elsif ($line =~ m/\/organism\=(.*)/) {
# harvest the organism
my $organism = $1;
$organism =~ s/\"//g;
$curr_record->{organism} = $organism;
}
else {
# skip all other FT lines
next LINE;
}
}
elsif ($line =~ m/^SQ\s+(.*)/) {
# harvest the sequence bp info
my $seq_info = $1;
# there are other ways of doing the following, this is just one way
my @parts = split /\;\s+/, $seq_info; # note the split pattern takes care of whitespace between records
$curr_record->{total_bp} = (split /\s+/, $parts[0])[2];
$curr_record->{num_a} = (split /\s+/, $parts[1])[1];
$curr_record->{num_c} = (split /\s+/, $parts[2])[1];
$curr_record->{num_g} = (split /\s+/, $parts[3])[1];
$curr_record->{num_t} = (split /\s+/, $parts[4])[1];
# reset the internal state of the program to know that it is
# harvesting sequence bps for future lines until the state gets reset
$inseq = 1;
}
elsif ($inseq) {
if ($line eq '//') {
# note that because of the self-contained nature of the subroutine, each time it is called
# the $inseq state variable is set to undefined, so we dont need to worry about resetting it
# manually here anymore.
last LINE;
}
else {
$line =~ s/^\s+//; # get rid of leading whitespace
# there are other ways of doing the following, this is just one way
my @parts = split /\s+/, $line;
pop @parts; # throw away the bp count
$curr_record->{sequence} .= join('', @parts); # concatenation into one line
}
}
else {
# skip all other lines
next LINE;
}
}
return $curr_record;
}
sub write_seq {
my $handle = shift;
my $record = shift;
# this is called a heredoc
# it prints everything from the next line to the string (I use EOR, but this is arbitrary) occurring
# by itself, on a line of the code. If the String is double quoted, variables are interpolated.
# If it is single quoted, variables are not interpolated.
# It is a redimentary form of templating. For more powerful templating, read up on the Template Toolkit,
# which is contained in the CPAN 'Template' Package.
print $handle <<"EOR";
record_start
ID $record->{id}
Accession $record->{accession}
RL $record->{rl}
Molecular Type: $record->{mol_type}
Organism: $record->{organism}
Chromosome: $record->{chromosome}
Number of Base Pairs: $record->{total_bp}
Number of As: $record->{num_a}
Number of Cs: $record->{num_c}
Number of Gs: $record->{num_g}
Number of Ts: $record->{num_t}
sequence_start
$record->{sequence}
sequence_end
record_end
EOR
}
1;
#!/usr/bin/perl
use strict; # this will always help you
use IGSPClass::SeqPackage;
# we could just work on <>, but this is more helpful to the user
my $embl_file = shift or die "embl_processor.pl path_to_embl_file\n";
# open the filehandle into a scalar variable
# this will become more useful later on in the class
open (my $embl, "<".$embl_file) or die "Couldnt open $embl_file $!\n";
my $output = *STDOUT;
while (my $seq = &next_seq($embl)) {
&write_seq($output, $seq);
}
close $embl;
exit;
Here are some unix hints to get you started:
a. make a directory in your home directory called perl_class_lib.
b. add /home/<your username>/perl_class_lib to your PERL5LIB environment variable using the bash syntax for this
c. create your Package under that directory, e.g. first make a subdirectory of perl_class_lib called IGSPClass, then
create the SeqPackage.pm file in perl_class_lib/IGSPClass.
Here is some psuedocode for SeqPackage.pm:
declare package IGSPClass::SeqPackage
use Exporter;
inherit from Exporter;
setup next_seq and write_seq to be exported
define the next_seq subroutine
define the write_seq subroutine
make sure it takes the output handle as an argument, along with the sequence to be written
1;
Here is some psuedocode for your script:
use IGSPClass::SeqPackage;
get the path to the file into a variable, or die with usage
open the path to the file into a variable
my $output = *STDOUT; # this will place the STDOUT into $output
while (seq <- next_seq(input)) {
write_seq($output, seq)
}
close the input handle
exit;
2. Use your IGSPClass::SeqPackage package to read in the sequences from the file, and write out to separate files named by molecular type.
The following perl code may come in Handy.
ANSWER:
#!/usr/bin/perl
use strict; # this will always help you
use IGSPClass::SeqPackage;
use IO::File;
# we could just work on <>, but this is more helpful to the user
my $embl_file = shift or die "embl_processor.pl path_to_embl_file\n";
# open the filehandle into a scalar variable
# this will become more useful later on in the class
open (my $embl, "<".$embl_file) or die "Couldnt open $embl_file $!\n";
my %mol_type_handles = (); # this will hold IO::File handles for each mol_type
while (my $seq = &next_seq($embl)) {
my $mol_type = $seq->{'mol_type'};
# only open the handle if it has not been opened already
unless (exists $mol_type_handles{$mol_type}) {
$mol_type_handles{$mol_type} = IO::File->new;
$mol_type_handles{$mol_type}->open('>'.$mol_type.'_sequences.out') or die $!;
}
&write_seq($mol_type_handles{$mol_type}, $seq);
}
close $embl;
map { $mol_type_handles{$_}->close } keys %mol_type_handles;
exit;
use IO::File;
my %outputs;
while (my $seq = next_seq($input)) {
my $mol_type = $seq->{'molecular_type'};
unless (exists $outputs{$mol_type}) {
$outputs{$mol_type} = new IO::File;
$outputs{$mol_type}->open('>'.$mol_type.'.out') or die $!;
}
write_seq($outputs{$mol_type}, $seq);
}
Comments (0)
You don't have permission to comment on this page.