G::Tools
Literature
G::Tools::Literature - Perl extension for PubMed literature search, full-text PDF retrieval, and keyword counting
|
Globals (from use vars definitions) |
@EXPORT |
$VERSION |
@EXPORT_OK |
use G::Tools::Literature; blah blah blah
|
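G::Tools::Literature searches PubMed for a query (PubMedSearch), tries to download the full-text PDF of every hit and saves the abstract as a text file when the PDF cannot be obtained, converts PDFs to text with the external pdftotext program (PDFtoTEXT), and counts how often a key word occurs in each collected document (WordCount, KeySearch).
The original documentation was an unedited h2xs stub; the summary above is derived from the code listed below.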
|
BEGIN | | Code |
DESTROY | No description | Code |
KeySearch | No description | Code |
PDFtoTEXT | No description | Code |
PubMedSearch | No description | Code |
WordCount | No description | Code |
new | No description | Code |
Methods description
Methods code
BEGIN
{
    # Load the networking modules at compile time, but only warn when one is
    # missing so that the rest of the package can still be compiled and used.
    # A string eval is required here because "use" must see a bare module name.
    foreach my $module (qw(LWP::Simple LWP::UserAgent HTTP::Cookies)) {
        eval "use $module;";
        # Every module reports its load failure; previously HTTP::Cookies
        # failed silently.
        warn "$@" if $@;
    }
}
sub DESTROY
{
    # Destructor hook: removes the invocant from the argument list and
    # performs no cleanup. The invocant is also the value of the call.
    my $self = shift @_;
    return $self;
}
# KeySearch(dir, key)
#
# Counts occurrences of "key" in every document found in directory "dir" and
# writes a ranked report to "dir/dir.key".
#
#   * Files ending in ".log" or ".key" are skipped.
#   * Files ending in ".pdf" are first converted with PDFtoTEXT.
#   * Every other regular file is read as text.
#
# Arguments are unpacked through opt_default/opt_get, which are defined
# outside this block.
#
# Returns the number of documents counted (0 when the directory cannot be
# read or is empty).
sub KeySearch
{
    &opt_default();
    my @args = opt_get(@_);
    my $dir  = shift @args;
    my $key  = shift @args;

    my %count_of;                 # text file path => number of hits
    my ($papers, $pdfs, $txts) = (0, 0, 0);

    my $dh;
    unless (opendir($dh, $dir)) {
        warn "KeySearch: cannot read directory \"$dir\": $!\n";
        return 0;
    }
    my @names = sort readdir($dh);
    closedir($dh);

    # Convert the PDFs first so that a ".txt" left behind by an earlier
    # conversion of the same PDF is not counted a second time below.
    my @documents;
    my %seen;
    foreach my $name (grep { /\.pdf$/ } @names) {
        next unless -f "$dir/$name";
        my $text = PDFtoTEXT("$dir/$name");
        next if !defined $text || $seen{$text}++;
        push(@documents, $text);
        $pdfs++;
    }
    foreach my $name (@names) {
        next if $name eq '.' || $name eq '..';
        next if $name =~ /\.(?:log|key|pdf)$/;
        my $path = "$dir/$name";
        next unless -f $path;     # sub-directories are not papers
        next if $seen{$path}++;
        push(@documents, $path);
        $txts++;
    }

    foreach my $path (@documents) {
        my $hits = WordCount($path, $key);
        $count_of{$path} = defined $hits ? $hits : 0;
        $papers++;
    }

    my $keyfile = "$dir/$dir".'.key';
    my $out;
    unless (open($out, '>', $keyfile)) {
        warn "KeySearch: cannot write \"$keyfile\": $!\n";
        return $papers;
    }
    print {$out} '**************************************************************'."\n".'**** Key Search (1.00) Key Word Count from PDF Documents ****'."\n".'**************************************************************'."\n\n";
    print {$out} "Key: $key\n";
    print {$out} "Directory: $dir\n";
    print {$out} "Paper: $papers\( PDF: $pdfs files TXT: $txts files\)\n";
    print {$out} "---------------------------------\n\n";

    # Highest count first; ties are broken by name so the report is stable.
    my @ranked = sort { $count_of{$b} <=> $count_of{$a} || $a cmp $b } keys(%count_of);
    foreach my $path (@ranked) {
        # Show the name relative to the searched directory.
        my $label = index($path, "$dir/") == 0 ? substr($path, length($dir) + 1) : $path;
        print {$out} "$label: $count_of{$path}\n";
    }
    close($out) or warn "KeySearch: error while closing \"$keyfile\": $!\n";
    return $papers;
}
# PDFtoTEXT(pdf)
#
# Converts the PDF file "pdf" to plain text with the external "pdftotext"
# program and returns the path of the text file: the input path with a
# trailing ".pdf" replaced by ".txt", or with ".txt" appended when there is no
# such suffix.
#
# The argument is unpacked through opt_default/opt_get, which are defined
# outside this block. A failed conversion is reported with a warning; the
# expected text path is still returned.
sub PDFtoTEXT
{
    &opt_default();
    my @args = opt_get(@_);
    my $pdf  = shift @args;
    return $pdf unless defined $pdf && length $pdf;

    my $txt = $pdf;
    $txt .= '.txt' unless $txt =~ s/\.pdf$/\.txt/;

    # List form bypasses the shell, so spaces or shell metacharacters in the
    # file name cannot break or hijack the command. The output name is passed
    # explicitly so that the returned path is always the file that was written.
    my $status = system('pdftotext', $pdf, $txt);
    warn "PDFtoTEXT: pdftotext failed for \"$pdf\" (status $status)\n" if $status != 0;
    return $txt;
}
# PubMedSearch(query, options...)
#
# Searches PubMed for "query" and, for every hit, tries to download the
# full-text PDF through the publisher link shown on the abstract page.
#
# Options are read through opt_default/opt_get/opt_val (defined outside this
# block):
#   limit  maximum number of hits requested        (default 500)
#   dir    output directory, must not exist yet    (default 'PUBMED' . time)
#   key    key word; when set, KeySearch(dir, key) is run at the end
#
# Output: "dir/N.pdf" when paper N was downloaded, otherwise "dir/N.txt" with
# the journal, title, authors, affiliation, abstract and PMID lines; a log in
# "dir/dir.log"; progress and a summary on STDOUT.
#
# Each paper ends in exactly one outcome (downloaded, not permitted, not
# found, invalid format, no PDF); as soon as one is recorded the remaining
# links of that paper are skipped, so the summary counters cannot overlap.
#
# NOTE(review): the query is only space-to-'+' encoded before it is placed in
# the search URL; other reserved characters are sent as they are.
sub PubMedSearch
{
    my $time = time;
    &opt_default(limit=>500, dir=>'PUBMED'.$time, key=>'');
    my @args  = opt_get(@_);
    my $query = shift @args;
    my $limit = opt_val('limit');
    my $dir   = opt_val('dir');
    my $key   = opt_val('key');

    my $rule   = "---------------------------------\n";
    my $banner = '**************************************************************'."\n".'** PubMed Search (1.00) Automatic Paper Acquisition System **'."\n".'**************************************************************'."\n\n";

    # Summary counters; $i is the number of hits processed so far and also
    # the file number of the current paper.
    my ($i, $download, $invalid, $permit, $nopdf, $notfound) = (0) x 6;

    # State of the current paper. Declared once here because the closures
    # below capture these variables; they are reset for every hit.
    my ($journal, $title, $authors, $affiliation, $abstract, $pmid) = ('') x 6;
    my $frag  = 0;    # an outcome has been recorded for this paper
    my $frag2 = 0;    # the abstract page offered a full-text link
    my ($url, $res);

    if (-d $dir) {
        print "\"$dir\" : The directory has already existed.\n";
        return;
    }

    my @date = localtime($time);
    $date[5] = $date[5] + 1900;
    $date[4] = $date[4] + 1;
    my $stamp = "Date: $date[5]\/$date[4]\/$date[3] $date[2]\:$date[1]\:$date[0]\n";

    print $banner;
    print $stamp;
    print "Query word is\" $query\".\n";
    print "Search limit is\" $limit\".\n";
    print "Key word is\" $key\".\n" if($key);
    print $rule;

    unless (mkdir($dir, 0777)) {
        warn "PubMedSearch: cannot create directory \"$dir\": $!\n";
        return;
    }

    # Logging is best effort: without a log file the search still runs.
    my $logfile = "$dir/$dir".'.log';
    my $log;
    unless (open($log, '>', $logfile)) {
        warn "PubMedSearch: cannot write log file \"$logfile\": $!\n";
        undef $log;    # a failed open still leaves a handle in the scalar
    }
    my $logit = sub { print {$log} @_ if $log; };

    $logit->($banner, $stamp, "Query: $query\n", "Limit: $limit\n");
    $logit->("Key: $key\n") if($key);
    $logit->("Directory: $dir\n", "Log file: $dir\/$dir\.log\n");
    $logit->("Key file: $dir\/$dir\.key\n") if($key);
    $logit->("---------------------------------\n\n");

    $query =~ tr/ /+/;
    my $ua = LWP::UserAgent->new;
    $ua->cookie_jar(HTTP::Cookies->new(file => "lwpcookies.txt", autosave => 1));

    # GET a URL and return the response object.
    my $get = sub {
        my ($target) = @_;
        return $ua->request(HTTP::Request->new(GET => "$target"));
    };

    # Save the abstract of the current paper as N.txt and log why no PDF was
    # kept. Arguments: STDOUT message, URL for the log ('' for none), state.
    my $save_abstract = sub {
        my ($message, $logged_url, $state) = @_;
        print "$i\.txt: $message\n";
        print $rule;
        _pubmed_write_file("$dir/$i".'.txt', "$journal\n$title\n$authors\n$affiliation\n$abstract\n$pmid\n", 0);
        $logit->(length($logged_url) ? "URL: $logged_url\n" : "URL:\n");
        $logit->("FILE: $i\.txt\n", "State: $state\n", "---------------------------------\n\n");
    };

    # Turn a response into an outcome for the current paper. A failed request
    # is no outcome (returns 0). A PDF body is saved as N.pdf; any other body
    # counts as "not permitted" and only the abstract is kept (returns 1).
    my $store = sub {
        my ($response, $from, $denied_message) = @_;
        return 0 unless $response->is_success;
        $frag = 1;
        if ($response->content =~ /^\%PDF.*/) {
            _pubmed_write_file("$dir/$i".'.pdf', $response->content, 1);
            print "$i\.pdf: Download from $from was successful.\n";
            print $rule;
            $logit->("URL: $from\n", "FILE: $i\.pdf\n", "State: Download successfully.\n", "---------------------------------\n\n");
            $download++;
        }
        else {
            $denied_message = "Not permitted in $from." unless defined $denied_message;
            $save_abstract->($denied_message, $from, 'Not permitted. Only abstract is saved.');
            $permit++;
        }
        return 1;
    };

    # Fetch a candidate PDF URL into $res (later steps inspect $res again)
    # and try to store it. Returns true when an outcome was recorded.
    my $grab = sub {
        my ($target, $denied_message) = @_;
        $res = $get->($target);
        return $store->($res, $target, $denied_message);
    };

    my $search = $get->('http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?SUBMIT=y&DB=PubMed&cmd=&term='.$query.'&dispmax='.$limit);
    unless ($search->is_success) {
        print "Error occured: PubMed isn't available.\n";
        print $rule;
        close($log) if $log;
        return;
    }

    PAPER: foreach my $hit (split(/\n/, $search->as_string)) {
        next PAPER unless $hit =~ /\<td width\=\"100\%\"\>\<font size\=\"\-1\"\>\<a href\=\"(.*)\"\>.*/;
        $url = $1;
        $i++;
        ($frag, $frag2) = (0, 0);
        ($journal, $title, $authors, $affiliation, $abstract, $pmid) = ('') x 6;
        $url =~ s/amp\;//g;
        my $page = $get->($url)->content;

        ########################################################
        ## Abstract                                            #
        ########################################################
        if ($page =~ /\<input name\=\"uid\" type\=\"checkbox\" value\=\"\d+\"\>\<b\>1\: \<\/b\>(.*)\<\/td\>/) {
            $journal = $1;
            if ($journal =~ /Error occured\: cannot get document summary/) {
                $journal = 'Error occured! cannot get document summary';
                print "$i\.txt: Not found $url.\n";
                print $rule;
                _pubmed_write_file("$dir/$i".'.txt', "$journal\n", 0);
                $logit->("\[PAPER $i\]\n", "Journal: $journal\n", "URL: $url\n", "FILE: $i\.txt\n", "State: Not found the abstract.\n", "---------------------------------\n\n");
                $notfound++;
                next PAPER;
            }
        }
        if ($page =~ /\<br\>\<font size\=\"\+1\"\>\<b\>(.*)\<\/b\>\<\/font\>\<br\>\<br\>\<b\>(.*)\<\/b\>\<br\>\<br\>(.*)\<br\>\<br\>(.*)\<br\>\<br\>(PMID\: .*)\<\/dd\>\n/) {
            ($title, $authors, $affiliation, $abstract, $pmid) = ($1, $2, $3, $4, $5);
        }
        elsif ($page =~ /\<br\>\<font size\=\"\+1\"\>\<b\>(.*)\<\/b\>\<\/font\>\<br\>\<br\>\<b\>(.*)\<\/b\>\<br\>\<br\>(.*)\<br\>\<br\>(.*)(PMID\: .*)\<\/dd\>\n/) {
            ($title, $authors, $affiliation, $abstract, $pmid) = ($1, $2, $3, $4, $5);
        }
        $logit->("\[PAPER $i\]\n", "Journal: $journal\n", "Title: $title\n", "Authors: $authors\n", "$pmid\n");

        LINK: foreach my $line (split(/\n/, $page)) {
            next LINK unless $line =~ /\<dd\>\<SPAN\>\<a href\=\"(.*)\" OnClick.*/;
            $frag2 = 1;
            $url = 'http://www.ncbi.nlm.nih.gov:80'.$1;

            ########################################################
            ## Direct Access                                       #
            ########################################################
            if ($url =~ /\.pdf$/) {
                $grab->($url, "Not permitted in $url. Only abstract is saved.");
                last LINK;
            }

            ########################################################
            ## Access to the site                                  #
            ########################################################
            $url =~ s/amp\;//g;
            $url =~ s/amp\%3[Bb]//g;
            # A HEAD request follows the redirects; the request attached to
            # the final response carries the publisher's URL.
            my $head  = $ua->request(HTTP::Request->new(HEAD => "$url"));
            my $final = $head->request;
            $url = $final->uri->as_string if $final && $final->uri;
            $url =~ s/amp\;//g;
            $url =~ s/amp\%3[Bb]//g;
            $res = $get->($url);
            unless ($res->is_success) {
                $save_abstract->("Not found $url.", $url, 'Not found the page. Only abstract is saved.');
                $notfound++;
                $frag = 1;
                last LINK;
            }

            ########################################################
            ## Springer                                            #
            ########################################################
            if ($url =~ /\.springer\./) {
                my $spfrag = 0;    # a direct PDF link was found on the page
                $url =~ s/index.html//;
                SPRINGER: foreach my $sp (split(/\n/, $res->content)) {
                    if ($sp =~ /\<frame title\=\"Navigation\" name\=\"nav\" src\=\"(.*)\".*/) {
                        $res = $get->($url.$1);
                        last SPRINGER;
                    }
                    elsif ($sp =~ /Otherwise click \<a href\=\"\.\.\/\.\.(.*)\"\>here\!\<\/a\>\<\/p\>/) {
                        my $tail = $1;
                        $url =~ s/(.*\/journals\/\d+)\/.*/$1$tail/;
                        $res = $get->($url);
                        $url =~ s/index.html//;
                        # Own list here: the enclosing link loop must keep
                        # iterating over the abstract page, not this page.
                        foreach my $inner (split(/\n/, $res->content)) {
                            if ($inner =~ /\<frame title\=\"Navigation\" name\=\"nav\" src\=\"(.*)\".*/) {
                                $res = $get->($url.$1);
                                last;
                            }
                        }
                        last SPRINGER;
                    }
                    elsif ($sp =~ /\<a href\=\"\.\.\/\.\.(.*\.pdf)\"\>PDF/ || $sp =~ /\<a HREF\=\"\.\.\/\.\.(.*\.pdf)\"\>Article in PDF format/) {
                        my $tail = $1;
                        $spfrag = 1;
                        $url =~ s/(.*\/journals\/\d+)\/.*/$1$tail/;
                        last SPRINGER if $grab->($url);
                    }
                }
                last LINK if $spfrag == 1;
                if ($res->content =~ /.*\"(.*\.pdf)\".*/) {
                    last LINK if $grab->($url.$1);
                }
            }

            ########################################################
            ## Springer-ny                                         #
            ########################################################
            if ($url =~ /\.springer-ny\./) {
                $url =~ s/index\.html//;
                last LINK if $grab->($url.'paper/index.html');
            }

            ########################################################
            ## Catchword                                           #
            ########################################################
            if ($url =~ /\.catchword\./) {
                if ($res->content =~ /SRC\=\"(.*)\" NAME\=\"toolbar\"/) {
                    $url = 'http://www.catchword.com/'.$1;
                    $res = $get->($url);
                    # "." does not cross newlines, so one match against the
                    # whole page finds the first line carrying the link.
                    if ($res->content =~ /\<a href\=\"(.*)\"alt\=\"full document\"/) {
                        $url = 'http://www.catchword.com/'.$1;
                        $grab->($url);
                    }
                    else {
                        $frag = 1;
                        $save_abstract->("Not permitted in $url.", $url, 'Not permitted. Only abstract is saved.');
                        $permit++;
                    }
                    last LINK if $frag;
                }
            }
            ########################################################
            ## InterScience                                        #
            ########################################################
            elsif ($url =~ /\.interscience\./) {
                if ($res->as_string =~ /.*\"(.*\.pdf)\".*/) {
                    last LINK if $grab->('http://www3.interscience.wiley.com'.$1);
                }
            }

            ########################################################
            ## Wiley                                               #
            ########################################################
            if ($url =~ /doi\.wiley\.com/) {
                if ($res->content =~ /\<h1\>Error\<\/h1\>/) {
                    $save_abstract->("Not found $url.", $url, 'Not found the page. Only abstract is saved.');
                    $notfound++;
                    $frag = 1;
                }
            }
            ########################################################
            ## Synergy                                             #
            ########################################################
            elsif ($url =~ /\.blackwell-synergy\./) {
                if ($res->as_string =~ /\<a href\=\"javascript\:newWindow\(\'(.*\.x\/pdf)\'.*/) {
                    $res = $get->('http://www.blackwell-synergy.com'.$1);
                    if ($res->as_string =~ /\<a href\=\"(.*pdf.*)\"\>/) {
                        $grab->($1);
                    }
                }
            }
            ########################################################
            ## EMBO                                                #
            ########################################################
            elsif ($url =~ /\/\/emboj\./) {
                if ($res->as_string =~ /\<A HREF\=\"(.*)\"\>Reprint \(PDF\)/) {
                    $url = 'http://emboj.oupjournals.org'.$1;
                    $url =~ s/content/reprint/;
                    $url = $url.'.pdf';
                    $grab->($url);
                }
            }
            ########################################################
            ## JVMS                                                #
            ########################################################
            elsif ($url =~ /\/\/jvms\./) {
                if ($res->content =~ /\<a href\=\"(.*)\"\>PDF/) {
                    $url = 'http://jvms.jstage.jst.go.jp'.$1;
                    $grab->($url);
                }
            }
            ########################################################
            ## J Biol Chem, J Clinical Inv and Neurology           #
            ########################################################
            elsif ($url =~ /\/\/(www\.jbc\.org)/ || $url =~ /\/\/(www\.jci\.org)/ || $url =~ /\/\/(www\.neurology\.org)/ || $url =~ /\/\/(circ\.ahajournals\.org)/ || $url =~ /\/\/(www\.pnas\.org)/ || $url =~ /\/\/(www\.fasebj\.org)/ || $url =~ /\/\/(www\.jneurosci\.org)/ || $url =~ /\/\/(bioinformatics\.oupjournals\.org)/) {
                my $host   = $1;
                my $body   = $res->content;
                my $target;
                if ($body =~ /window\.location \= \"(.*)\"\;/) {
                    $target = 'http://'.$host.$1;
                }
                elsif ($body =~ /\<A HREF\=\"(.*)\"\>Reprint \(PDF\)/ || $body =~ /\<A HREF\=\"(.*)\"\>Screen \(PDF\)/) {
                    $target = 'http://'.$host.$1.'.pdf';
                }
                if (defined $target) {
                    $url = $target;
                    $grab->($url);
                }
            }
            ########################################################
            ## Nature                                              #
            ########################################################
            elsif ($url =~ /\/\/www\.nature\.com/) {
                if ($res->content =~ /Full text.*\"(.*)\"\>PDF/) {
                    $url = 'http://www.nature.com'.$1;
                    $grab->($url);
                }
            }
            ########################################################
            ## Portlandpress                                       #
            ########################################################
            elsif ($url =~ /\/\/cs\.portlandpress\.com/) {
                if ($res->content =~ /\<A class\=\"sidelinks\" HREF\=\"(.*\.pdf)\"\>\<img src/) {
                    $url = 'http://cs.portlandpress.com'.$1;
                    $grab->($url);
                }
            }
            ########################################################
            ## Elsevier                                            #
            ########################################################
            elsif ($url =~ /\/\/linkinghub\.elsevier\.com/) {
                # Start from the current URL so that the messages below never
                # show a value left over from an earlier paper.
                my $target = $url;
                if ($res->content =~ /\<a HREF\=\"(.*)\"\>\<img border.*src\=\"http\:\/\/www\.sciencedirect\.com\//) {
                    $target = $1;
                    $target =~ s/amp\;//g;
                    $res = $get->($target);
                }
                if ($res->content =~ /.*\"(.*\.pdf)\".*/) {
                    $grab->($1);
                }
                else {
                    $frag = 1;
                    $save_abstract->("Not permitted in $target.", $target, 'Not permitted. Only abstract is saved.');
                    $permit++;
                }
            }
            ########################################################
            ## ScienceDirect and Others                            #
            ########################################################
            else {
                if ($res->as_string =~ /.*\"(.*\.pdf)\".*/) {
                    my $target = $1;
                    $target =~ s/UADB\/xppview\/// if($url =~ /\.acs\.org\//);
                    $res = $get->($target);
                    if ($res->is_success) {
                        unless ($res->content =~ /^\%PDF.*/) {
                            # Not a PDF yet: retry with the site root put in
                            # front of the link.
                            if ($url =~ /(http\:\/\/.*?)\/.*/) {
                                $target = $1.$target;
                            }
                            $res = $get->($target);
                        }
                        $store->($res, $target);
                    }
                }
                elsif ($res->as_string =~ /.*\"(http\:\/\/.*pdf.*)\".*/) {
                    $grab->($1);
                }
            }

            # One outcome per paper: stop looking at further links.
            last LINK if $frag;
        }

        if ($frag == 0 && $frag2 == 1) {
            if ($url =~ /.*\.sciencedirect\..*/) {
                $save_abstract->("Not permitted in $url.", $url, 'Not permitted. Only abstract is saved.');
                $permit++;
            }
            else {
                $save_abstract->("Invalid format from $url.", $url, 'Invalid format. Only abstract is saved.');
                $invalid++;
            }
        }
        elsif ($frag == 0 && $frag2 == 0) {
            $save_abstract->("There is no PDF document.", '', 'There is no PDF document. Only abstract is saved.');
            $nopdf++;
        }
    }

    # Share of papers with a full-text link that were actually downloaded.
    my $candidates = $i - $nopdf;
    my $rate = $candidates != 0 ? sprintf("%d", $download/$candidates*100) : 0;

    print "Searched from ".$i." papers.","\n";
    print "Downloaded: $download\n";
    print "Not permitted: $permit\n";
    print "Not found: $notfound\n";
    print "Invalid format: $invalid\n";
    print "No PDF: $nopdf\n";
    print $rate."\% of papers have been downloaded.\n\n";
    print "Directory is\" $dir\".\n";
    print "Log file is\" $dir\/$dir\.log\"\n";
    print "Key file is\" $dir\/$dir\.key\"\n" if($key);

    $logit->("Total: $i\n", "Downloaded: $download\n", "Not permitted: $permit\n", "Not found: $notfound\n", "Invalid format: $invalid\n", "No PDF: $nopdf\n");
    $logit->("Accuracy: ".$rate."\%\n");
    close($log) if $log;

    if($key){
        KeySearch($dir, $key);
    }
}

# Private helper of PubMedSearch: write $data to $path, in binary mode when
# $binary is true. Returns 1 on success; warns and returns 0 on failure.
sub _pubmed_write_file
{
    my ($path, $data, $binary) = @_;
    my $fh;
    unless (open($fh, '>', $path)) {
        warn "PubMedSearch: cannot write \"$path\": $!\n";
        return 0;
    }
    binmode($fh) if $binary;    # PDF bytes must not go through newline translation
    print {$fh} $data;
    unless (close($fh)) {
        warn "PubMedSearch: error while closing \"$path\": $!\n";
        return 0;
    }
    return 1;
}
# WordCount(file, query)
#
# Returns how many times "query" matches in "file", counted line by line.
# The query is used as a regular expression, as before; a query that is not a
# valid regular expression (for example "C++") is matched literally instead
# of aborting the program.
#
# Arguments are unpacked through opt_default/opt_get, which are defined
# outside this block. Returns 0 for an empty query or an unreadable file.
sub WordCount
{
    &opt_default();
    my @args  = opt_get(@_);
    my $file  = shift @args;
    my $query = shift @args;
    my $count = 0;

    # An empty pattern would make Perl reuse the last successful pattern.
    return $count unless defined $file && defined $query && length $query;

    my $pattern = eval { qr/$query/ };
    $pattern = qr/\Q$query\E/ unless defined $pattern;

    my $fh;
    unless (open($fh, '<', $file)) {
        warn "WordCount: cannot read \"$file\": $!\n";
        return $count;
    }
    while (my $line = <$fh>) {
        # Count matches one at a time so that capture groups inside the
        # query do not inflate the result.
        $count++ while $line =~ /$pattern/g;
    }
    close($fh);
    return $count;
}
# new(filename, option)
#
# Constructor. Returns an object blessed into the invoking class (or into the
# class of the invoking object) that keeps the two arguments under the keys
# "filename" and "option". Previously the constructor always returned undef.
# Returns nothing when no class is given.
sub new
{
    my $pkg      = shift;
    my $filename = shift;
    my $option   = shift;

    my $class = ref($pkg) || $pkg;
    return unless defined $class && length $class;

    my $this = {
        filename => $filename,
        option   => $option,
    };
    return bless($this, $class);
}
General documentation