#!/usr/bin/perl
# tests if papers really do follow the model download
# Usage: decay3cite < ../db/nodupdloads
# Author: Ian Hickman
#
# For every paper listed in ../db/3cite (papers with three citations) whose
# first submission and three citing papers all fall inside the study window,
# each download hit read from the log is placed in one of four phases:
#   before - from submission up to the first citation
#   mid1   - from the first citation up to the second
#   mid2   - from the second citation up to the third
#   after  - from the third citation onwards
# For each phase a histogram "day offset since the start of the phase ->
# number of hits" is written to d_3citebefore, d_3citemid1, d_3citemid2 and
# d_3citeafter respectively.

use strict;
use warnings;
use Time::Local;

use constant SECONDS_PER_DAY => 3600 * 24;

# Month abbreviation (as it appears in the log time stamp) -> 0-based month
# number, which is what timelocal() expects.
my %month_index = (
    Jan => 0, Feb => 1, Mar => 2, Apr => 3, May => 4,  Jun => 5,
    Jul => 6, Aug => 7, Sep => 8, Oct => 9, Nov => 10, Dec => 11,
);

# Read the citation list: one "citing-paper cited-paper" pair per line.
# Returns two hash refs:
#   cited paper  -> array ref of the papers citing it, in file order
#   citing paper -> the paper it cites (only used as a membership test)
sub read_citations {
    my ($path) = @_;

    my ( %citedfrom, %citedto );
    open my $fh, '<', $path or die "cant open citation data $!";
    while ( my $line = <$fh> ) {
        my ( $citer, $cited ) = split ' ', $line;
        next unless defined $cited;    # blank or truncated line
        push @{ $citedfrom{$cited} }, $citer;
        $citedto{$citer} = $cited;
    }
    close $fh or die "cant close citation data $!";

    return ( \%citedfrom, \%citedto );
}

# Read the first-submission dates of all papers.  Field 0 of each line is a
# YYYYMMDD date and field 2 is the paper id.  Only dates strictly inside the
# study window (after 23 Aug 1999 and before 10 May 2000, local midnight)
# are kept.
# Returns two hash refs, both "paper id -> submission time (epoch seconds)":
#   one for the cited papers and one for the citing papers.
sub read_submissions {
    my ( $path, $citedfrom, $citedto ) = @_;

    my $start = timelocal( 0, 0, 0, 23, 7, 1999 );
    my $end   = timelocal( 0, 0, 0, 10, 4, 2000 );

    my ( %submit, %citing );
    open my $fh, '<', $path or die "cant open firstsubmit $!";
    while ( my $line = <$fh> ) {
        my @field = split ' ', $line;
        next unless defined $field[2];

        my ( $year, $month, $day ) = $field[0] =~ /(....)(..)(..)/;
        next unless defined $month && $month > 0;

        my $time = timelocal( 0, 0, 0, $day, $month - 1, $year );
        next unless $time > $start && $time < $end;

        # is this paper in our database of papers with 3 citations?
        $submit{ $field[2] } = $time if exists $citedfrom->{ $field[2] };

        # is this paper one of the citing papers?
        $citing{ $field[2] } = $time if exists $citedto->{ $field[2] };
    }
    close $fh or die "cant close $!";

    return ( \%submit, \%citing );
}

# For every cited paper whose first three citing papers all have a known
# submission time, return those three times in ascending order.
# Returns a hash ref: cited paper -> [ first, second, third ].
# Papers with fewer than three citing papers, or with a citing paper outside
# the study window, are left out.
sub order_citations {
    my ( $citedfrom, $citing ) = @_;

    my %ordered;
    for my $paper ( keys %{$citedfrom} ) {
        my @citers = @{ $citedfrom->{$paper} };
        next if @citers < 3;

        my @three = @citers[ 0 .. 2 ];
        next if grep { !exists $citing->{$_} } @three;

        # A numeric sort yields the same first/second/third assignment as
        # enumerating all six orderings by hand.
        $ordered{$paper} = [ sort { $a <=> $b } @{$citing}{@three} ];
    }

    return \%ordered;
}

# Build the unique paper id "archive/number" from a log line.
# $path_field is the seventh whitespace-separated field of the line; when it
# mentions "papers" the archive name is taken from the /ftp/<archive>/papers
# form, otherwise from the path component preceding the 7-digit number.
# Returns nothing when either part cannot be found.
sub paper_id {
    my ( $line, $path_field ) = @_;

    my ($papernum) = $line =~ /(\d{7})/;

    my $archive;
    if ( defined $path_field && $path_field =~ /papers/ ) {
        ($archive) = $line =~ /\/ftp\/([\w-]+)\.?.*\/papers/;
    }
    else {
        ($archive) = $line =~ /\/.*\/([\w-]+)\.?.*\/\??\d{7}/;
    }

    return unless defined $archive && defined $papernum;
    return $archive . "/" . $papernum;
}

# Convert a log time stamp such as "[24/Aug/1999:10:00:00" to the epoch time
# of local midnight on that day (hits are only resolved to the nearest day).
# Returns nothing when the stamp is missing or does not parse.
sub hit_time {
    my ($stamp) = @_;
    return unless defined $stamp;

    my ( $day, $monthtext, $year ) = $stamp =~ /^.(\d\d).(\w+).(\d{4}).*/;
    return unless defined $year;

    my $month = $month_index{$monthtext};
    return unless defined $month;

    return timelocal( 0, 0, 0, $day, $month, $year );
}

# Decide which phase a hit belongs to.
# Arguments: hit time, submission time, then the first, second and third
# citation times (ascending).
# Returns ( phase name, start time of that phase ), or nothing for a hit
# that predates the submission.  A hit falling exactly on the third
# citation time is counted in "after", so no hit is dropped at that
# boundary.
sub classify_hit {
    my ( $hittime, $submitted, $first, $second, $third ) = @_;

    return if $hittime < $submitted;
    return ( 'before', $submitted ) if $hittime < $first;
    return ( 'mid1',   $first )     if $hittime < $second;
    return ( 'mid2',   $second )    if $hittime < $third;
    return ( 'after',  $third );
}

# Round a non-negative number to the nearest integer, halves rounding up.
# Differences between two local midnights are not always a whole number of
# days (daylight-saving changes shift them by an hour), hence the rounding.
sub roundoff {
    my ($num) = @_;

    my $whole = int $num;
    return $whole if $whole == $num;
    return ( $num - $whole ) < 0.5 ? $whole : $whole + 1;
}

# Write one histogram to d_3cite<label>.
# The "offset count" lines are written in ascending numeric order to the
# intermediate file d_3cite<label>2, which is then piped through the
# external zeropad program to produce the final file.
# NOTE(review): zeropad is not part of this file; presumably it fills in
# the missing day offsets with zero counts - confirm.
sub write_histogram {
    my ( $label, $hist ) = @_;

    my $outfile = "d_3cite$label";
    my $sorted  = $outfile . "2";

    open my $out, '>', $sorted or die "cant open $label $!";
    for my $offset ( sort { $a <=> $b } keys %{$hist} ) {
        print {$out} "$offset $hist->{$offset}\n";
    }
    close $out or die "cant close $label $!";

    if ( system("zeropad < $sorted > $outfile") != 0 ) {
        # Keep the sorted intermediate file so the counts are not lost.
        warn "zeropad failed for $outfile, sorted data left in $sorted\n";
        return;
    }
    unlink $sorted or warn "cant remove $sorted $!";
    return;
}

# Entry point: load the citation and submission data, then read the
# download log (standard input, or files named on the command line) and
# accumulate the four histograms.
sub main {
    # open list of papers with three cites to them
    my ( $citedfrom, $citedto ) = read_citations("../db/3cite");

    # first submission dates of the cited papers and of the citing papers
    my ( $submit, $citing ) =
      read_submissions( "../../d_firstsubmit", $citedfrom, $citedto );

    # days of the first, second and third cites of every usable paper
    my $cite_times = order_citations( $citedfrom, $citing );

    my %hist = map { $_ => {} } qw(before mid1 mid2 after);

    # main loop
    while ( my $line = <> ) {
        my @field = split ' ', $line;

        # $paper is the unique id of the paper
        my $paper = paper_id( $line, $field[6] );
        next unless defined $paper;
        next unless exists $submit->{$paper} && exists $cite_times->{$paper};

        my $submitted = $submit->{$paper};
        my ( $first, $second, $third ) = @{ $cite_times->{$paper} };

        # only papers first cited after they were submitted are of interest
        next unless $first > $submitted;

        # get the date of the hit (nearest day)
        my $hittime = hit_time( $field[3] );
        next unless defined $hittime;

        my ( $phase, $phase_start ) =
          classify_hit( $hittime, $submitted, $first, $second, $third );
        next unless defined $phase;

        my $offset = roundoff( ( $hittime - $phase_start ) / SECONDS_PER_DAY );
        $hist{$phase}{$offset}++;
    }

    write_histogram( $_, $hist{$_} ) for qw(before mid1 mid2 after);
    return;
}

# Run only when executed as a script, so that the helper subs above can be
# loaded (e.g. by a test) without touching the data files.
main() unless caller;