Friday, September 25, 2009

How to colorize any text-based output

I found this little tidbit the other day. Very useful for highlighting simulation output warnings and errors:

http://winterstorm.ca/hilite

Wednesday, September 23, 2009

Extract prices from craigslist

#!/usr/bin/perl
# This is a script that will help me search craigslist and extract
# price information and average the data to get what the going
# price of an item is
#
# Example:
#; ~/bin/cl_get_prices.pl -string=2006 -string=maxima
#; ~/bin/cl_get_prices.pl -string=2003 -string=maxima -cities=dallas
#; ~/bin/cl_get_prices.pl -string=2003 -string=maxima -verb=4
#; ~/bin/cl_get_prices.pl -string=ps3 -string=80gb -verb=4
#; ~/bin/cl_get_prices.pl -string=ps3 -string=80gb -trim=.333
#; ~/bin/cl_get_prices.pl -string=ps3 -string=60gb -trim=.2 -filter_on_title=1
#
# I think I am going to give three outputs. Mean, Median and Trimmed Mean.
# This way if there is outlier data it will be easy to spot in the Mean
# value.

use strict;
use Getopt::Long;
use List::Util qw(sum);
use Statistics::Descriptive;
use List::MoreUtils qw(uniq);

# Prices collected across all searched cities; get_item_prices() appends to it.
my @item_prices;

# Seconds to wait between two consecutive per-city searches.
my $delay_between_searches = 5;

# Define the arguments
my %opts = ();
usage() unless GetOptions(
    \%opts, 'help',
    'strings=s@',
    'cities=s@',
    'trim=s',
    'filter_on_title=s',
    'min=s',
    'max=s',
    'verbosity=i',
);

# At least one search string is required. The previous test compared a
# one-key hash slice against 0 as a string, which could never be true, so a
# missing -strings option went undetected.
unless ( $opts{strings} && @{ $opts{strings} } ) {
    die " NO SEARCH STRING WAS GIVEN";
}

# Defaults for everything the user did not supply.
$opts{cities}          = ['austin'] unless exists $opts{cities};
$opts{trim}            = .25        unless exists $opts{trim};
$opts{min}             = ""         unless exists $opts{min};
$opts{max}             = ""         unless exists $opts{max};
$opts{verbosity}       = 3          unless exists $opts{verbosity};
$opts{filter_on_title} = 0          unless exists $opts{filter_on_title};

# Run one search per city, pausing only *between* searches so there is no
# pointless delay after the final one.
my @cities = @{ $opts{cities} };
for my $city_idx ( 0 .. $#cities ) {
    sleep($delay_between_searches) if $city_idx > 0;
    get_item_prices(
        \@item_prices, $opts{strings}, $cities[$city_idx],
        $opts{min}, $opts{max}, $opts{filter_on_title},
    );
}

# Numeric sort, then drop repeated price values.
@item_prices = sort { $a <=> $b } @item_prices;
@item_prices = uniq @item_prices;

print "Prices: @item_prices\n";

if (@item_prices) {
    my $stat = Statistics::Descriptive::Full->new();
    $stat->add_data(@item_prices);
    $Statistics::Descriptive::Tolerance = 1e-10;
    my $mean = $stat->mean();
    my $var  = $stat->variance();
    my $tm   = $stat->trimmed_mean( $opts{trim} );
    my $std  = $stat->standard_deviation();

    printf "Statistics: Mean: %10.4f Var: %10.4f Std Dev: %10.4f Trimmed mean: %10.4f\n",
        $mean, $var, $std, $tm;
    print "" . scalar(@item_prices) . " ITEMS WERE FOUND\n";
    print "lowest:\t\t$item_prices[0]\n";
    print "highest:\t$item_prices[$#item_prices]\n";
    print "Mean:\t\t" . sum(@item_prices) / @item_prices . "\n";

    # Use the library median so an even number of prices yields the average
    # of the two middle values instead of just the upper-middle element.
    print "Median:\t\t" . $stat->median() . "\n";
    print "Trim Mean:\t" . $tm . "\n";
}
else {
    # Nothing matched: report it instead of dying on a division by zero.
    print "0 ITEMS WERE FOUND\n";
}

# get_item_prices( \@item_prices, \@strings, $city, $min, $max, $filter_on_title )
#
# Fetches the craigslist search result page for $city (query = the search
# strings joined with "+") using wget, picks out the listing titles, and
# appends every "$<digits>" price found in those titles to @item_prices.
#
#   $item_prices_ref  - array ref that found prices are pushed onto
#   $strings_ref      - array ref of search strings
#   $city             - craigslist sub-domain to search
#   $min, $max        - NOTE(review): accepted but not used by this sub;
#                       confirm whether they were meant to go into the query
#   $filter_on_title  - when "1", keep only titles containing every search
#                       string (case-insensitive)
#
# Reads $opts{verbosity} from the enclosing file to decide how much debug
# output to print. Returns nothing.
sub get_item_prices {
    my ( $item_prices_ref, $strings_ref, $city, $min, $max, $filter_on_title ) = @_;

    my $url = "http://$city.craigslist.org/search/sss?query=" . join( '+', @$strings_ref );

    # List-form pipe open runs wget directly, without a shell, so characters
    # in the user-supplied city/search strings cannot be interpreted as shell
    # syntax.
    my $html = '';
    if ( open( my $wget_fh, '-|', 'wget', '-q', '-O', '-', $url ) ) {
        local $/;    # slurp the whole page at once
        $html = <$wget_fh>;
        $html = '' unless defined $html;
        close $wget_fh;
    }
    else {
        warn "Could not run wget for $url: $!\n";
    }
    print $html . "\n" if $opts{verbosity} > 6;

    # The capturing group makes split return separator pieces as well; they
    # are discarded by the title test below.
    my @chunks = split /(<\/?p>\s*)+/, $html;

    # Filter out everything but the titles: six characters followed by " - ".
    my @titles;
    for my $idx ( 0 .. $#chunks ) {
        my $chunk = $chunks[$idx];
        next unless defined $chunk && $chunk =~ m/^......\s-\s/;
        print "$idx|$chunk\n" if $opts{verbosity} > 4;
        push @titles, $chunk;
    }

    # Clean up the title and do additional filtering on the title
    my @kept_titles;
    TITLE:
    for my $title (@titles) {
        $title =~ s/^.+html">//;

        # NOTE(review): the entity names below are a reconstruction. As
        # published, these three substitutions deleted literal '>' and '<'
        # characters and replaced a space with a space, which left the tag
        # stripping that follows with nothing to match - presumably the
        # entities were decoded when the script was posted. Confirm.
        $title =~ s/&gt;//g;
        $title =~ s/&lt;//g;
        $title =~ s/&nbsp;/ /g;
        $title =~ s/<[^>]*>//gs;

        if ( defined $filter_on_title && $filter_on_title eq "1" ) {
            for my $str (@$strings_ref) {
                # \Q..\E: match the search string literally, not as a regex.
                next TITLE if $title !~ m/\Q$str\E/i;
            }
        }
        push @kept_titles, $title;
    }

    # Extract price information from the titles
    for my $title ( sort @kept_titles ) {
        next if $title eq "";
        print " $title\n" if $opts{verbosity} > 3;
        for my $word ( split ' ', $title ) {
            next unless $word =~ m/^\$/;

            # Drop the dollar sign and thousands separators; keep the word
            # only if what remains is purely digits.
            ( my $digits = $word ) =~ s/[\$,]//g;
            push @$item_prices_ref, $digits if $digits =~ m/^\d+$/;
        }
    }

    return;
}