#!/usr/bin/perl
# A script to calculate the hit rate of papers before and after a
# journal reference was added to them.
#
# Usage: diffinjournal < alldownloads
#
# Input:
#   * the paper list file "papswithjourns" (see openchanges() below for
#     its line format);
#   * a web access log on STDIN.  Whitespace-separated field 3 must hold
#     the hit timestamp (day, month name, year, hour, minute, second) and
#     field 6 the requested path, ending in ".../<name>/<7 digits>".
#
# Output (STDOUT): one line per tracked paper that received at least one
# hit, containing four space-separated values:
#   seconds-before-journal-ref  hits-before  seconds-after-journal-ref  hits-after
# Progress and diagnostics go to STDERR.
#
# Author: Ian Hickman
use strict;
use warnings;
use Time::Local;

# Month abbreviation => zero-based month index, as timelocal() expects.
my %months = (
    Jan => 0, Feb => 1, Mar => 2,  Apr => 3,
    May => 4, Jun => 5, Jul => 6,  Aug => 7,
    Sep => 8, Oct => 9, Nov => 10, Dec => 11,
);

# State shared between openchanges() and the main loop.
my ( $startoflogs, $endoflogs );
my ( %firstpublished, %lengthbeforejr, %lengthafterjr );
my ( %hitsbeforejr, %hitsafterjr, %jrdate, %normaldate );
my ( $total, $papercount, $hitcount ) = ( 0, 0, 0 );
my $skipped = 0;    # log lines that could not be parsed

openchanges();
print STDERR "Total: $total\n";

# Walk the access log, attributing each hit on a tracked paper to the
# period before or after its journal reference was added.
while ( my $line = <STDIN> ) {
    my @field = split ' ', $line;

    # We need at least the timestamp (field 3) and the path (field 6).
    if ( @field < 7 ) {
        $skipped++;
        next;
    }

    # Get the date (and time) of the hit.
    my ( $day, $monthtext, $year, $hour, $min, $sec ) =
        $field[3] =~ /.(\d\d).(\w+).(\d{4}).(\d\d).(\d\d).(\d\d)/;
    my $month = defined $monthtext ? $months{$monthtext} : undef;

    # An unknown month name must not silently be treated as January.
    if ( !defined $month ) {
        $skipped++;
        next;
    }

    # timelocal() croaks on out-of-range values; treat that as a bad line
    # instead of aborting the whole run.
    # NOTE(review): timelocal() interprets the timestamp in the local time
    # zone; any zone offset recorded in the log is ignored -- confirm that
    # this matches how the logs were written.
    my $time = eval { timelocal( $sec, $min, $hour, $day, $month, $year ) };
    if ( !defined $time ) {
        $skipped++;
        next;
    }

    $papercount++;

    # The paper reference is the last two path components: name/7-digits.
    my ($paper) = $field[6] =~ /.*\/(.*\/\d{7}$)/;

    # Ignore downloads of papers that are not in our table ...
    next unless defined $paper && exists $firstpublished{$paper};

    # ... and hits that precede the start of the paper's tracked period.
    next unless $time > $firstpublished{$paper};

    $hitcount++;
    print STDERR "papers: $papercount hits: $hitcount\n";

    # This is our paper, but is the hit before or after the journal ref?
    if ( $time < $jrdate{$paper} ) {
        $hitsbeforejr{$paper}++;
    }
    else {
        $hitsafterjr{$paper}++;
    }
}

print STDERR "Skipped $skipped malformed log lines\n" if $skipped;

# Report every tracked paper that received at least one hit.  Keys are
# sorted so that the output order is reproducible between runs.
foreach my $paper ( sort keys %firstpublished ) {
    next unless $hitsbeforejr{$paper} != 0 || $hitsafterjr{$paper} != 0;

    print "$lengthbeforejr{$paper} ";
    print "$hitsbeforejr{$paper} ";
    print "$lengthafterjr{$paper} ";
    print "$hitsafterjr{$paper}\n";
}

# openchanges( [$path] )
#
# Load the list of papers that gained a journal reference and fill in the
# file-level tables used by the main loop.
#
# $path is optional and defaults to "papswithjourns".  Each line of the
# file is whitespace-separated:
#   field 0 = paper reference
#   field 1 = submission date (YYYYMMDD)
#   field 2 = date the journal ref was added (YYYYMMDD)
#
# Only papers whose journal-ref date lies strictly inside the log window
# are kept.  For each of them this sets %jrdate, %normaldate,
# %firstpublished, %lengthbeforejr, %lengthafterjr (both in seconds), zeroes
# %hitsbeforejr / %hitsafterjr, and increments $total.  It also sets
# $startoflogs and $endoflogs.
#
# Lines with an unparsable date are reported on STDERR and skipped.
# Dies if the file cannot be opened or closed.
sub openchanges {
    my $path = defined $_[0] ? $_[0] : 'papswithjourns';

    open( my $paperlist, '<', $path )
        || die "Cant open list of papers: $!";

    # The start and end times we have logs for: 24 Aug 1999 and 9 May 2000
    # (timelocal() months are zero-based, hence 7 and 4).
    $startoflogs = timelocal( 0, 0, 0, 24, 7, 1999 );
    $endoflogs   = timelocal( 0, 0, 0, 9,  4, 2000 );

    # Loop through all the papers.
    while ( defined( my $line = <$paperlist> ) ) {
        my @field = split ' ', $line;
        next unless @field;    # blank line

        my $ref = $field[0];

        # Get the date that a journal ref was added.
        my $datejournalref = _parse_yyyymmdd( $field[2] );
        if ( !defined $datejournalref ) {
            warn "Skipping paper list line $.: bad journal ref date\n";
            next;
        }

        # Check that the date the journal ref was added is within the
        # boundaries of the logs.
        next
            unless $datejournalref < $endoflogs
            && $datejournalref > $startoflogs;

        # Calculate the date first published.  Validate it before touching
        # any table so a bad line leaves no partial entry behind.
        my $datepublished = _parse_yyyymmdd( $field[1] );
        if ( !defined $datepublished ) {
            warn "Skipping paper list line $.: bad submission date\n";
            next;
        }

        # We have no logs before $startoflogs, so the observable period
        # for a paper cannot begin earlier than that.
        $datepublished = $startoflogs if $datepublished < $startoflogs;

        $total++;
        $jrdate{$ref}         = $datejournalref;
        $normaldate{$ref}     = $field[2];
        $firstpublished{$ref} = $datepublished;
        $lengthbeforejr{$ref} = $datejournalref - $datepublished;
        $lengthafterjr{$ref}  = $endoflogs - $datejournalref;
        $hitsbeforejr{$ref}   = 0;
        $hitsafterjr{$ref}    = 0;
    }

    close($paperlist) || die "Cant close paper list: $!";
    return;
}

# _parse_yyyymmdd( $text )
#
# Convert a string starting with YYYYMMDD into the epoch time of local
# midnight on that day.  Returns undef (in scalar context) when $text is
# undefined, does not start with eight digits, or names an invalid date.
sub _parse_yyyymmdd {
    my ($text) = @_;

    return unless defined $text && $text =~ /\A(\d{4})(\d\d)(\d\d)/;
    my ( $year, $month, $day ) = ( $1, $2, $3 );

    # timelocal() croaks on out-of-range values; report that as undef.
    my $epoch = eval { timelocal( 0, 0, 0, $day, $month - 1, $year ) };
    return unless defined $epoch;
    return $epoch;
}