#!/Perl/bin/perl -w
#
# ICSERGen - The Incredible C Software Engineering Review Generator.
# Generates an automatic review of a Software Engineering paper,
# given in pdf format.
#
# Author: Crista Lopes, University of California, Irvine
#         lopes at uci dot edu
#
# 12/19/2006: Created over Xmas break.
#
# Copyright (c) 2006 by the Regents of the University of California.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
#   copyright notice, this list of conditions and the following
#   disclaimer in the documentation and/or other materials provided
#   with the distribution.
# * Neither the name of the University of California, Irvine,
#   nor the names of its contributors may be used to endorse or
#   promote products derived from this software without specific
#   prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
use strict;
use warnings;
use CGI;
use LWP 5.64;

#
# Start a log entry
#
open(my $log_fh, '>>', "C:/SERVER/WEB/icsergen/LOG") or die("Could not open LOG\n");
select((select($log_fh), $| = 1)[0]);    # make it hot, i.e. always flush
my ($sec, $min, $hour, $mday, $mon, $year) = localtime(time);
print {$log_fh} "\n--------------------\n";
printf {$log_fh} "%4d-%02d-%02d %02d:%02d:%02d\n",
    $year + 1900, $mon + 1, $mday, $hour, $min, $sec;

#
# Log a failure, tell the user about it and stop.
#   $log_message  - line appended to the log file
#   $user_message - text printed on standard output
# Exits with status 0, so it never returns.
#
sub _abort_request {
    my ($log_message, $user_message) = @_;
    print {$log_fh} $log_message;
    close($log_fh);
    print $user_message;
    exit 0;
}

#
# Directory to place the uploaded files
#
my $upload_dir = "C:/SERVER/WEB/icsergen/upload";

#
# Check if this is being run from CGI or from the command line
#
# NOTE(review): the shell branch requires exactly two arguments although
# only the first one is used, and the usage comment shows one - confirm
# the intended invocation before changing the test.
#
my $numargs   = scalar @ARGV;
my $filename  = '';
my $keepfiles = 0;
my $query;
if ($numargs == 2) {
    # from command line: perl icsergen foo
    $filename  = $ARGV[0];
    $keepfiles = 1;
    print {$log_fh} "Request for \"$filename\" through shell\n";
}
else {
    # from CGI
    $query = CGI->new;
    my $paper_param = $query->param("paper");
    # A request without a "paper" field is treated as an empty file name,
    # which the pdf check below rejects.
    $filename = defined $paper_param ? "$paper_param" : '';
    print {$log_fh} "Request for \"$filename\" through CGI\n";
    # Keep only what follows the last path separator.  /s lets ".*" cross
    # newlines so no separator can survive in a multi-line name.
    $filename =~ s{\A.*[/\\]}{}s;
    print $query->header();
}

#
# Make sure the given file is a pdf file (checked before anything is
# written to disk, so rejected uploads leave nothing behind)
#
if ($filename !~ /\.pdf\z/i) {
    _abort_request("ERROR not a pdf file\n",
                   "You must provide a pdf file. Try again.\n\n");
}

my $pdf_filename = $filename;
my $pdf_path     = "$upload_dir/$pdf_filename";

#
# For CGI requests, get the paper and make a local copy in the file system
#
if (defined $query) {
    my $upload_filehandle = $query->upload("paper");
    my $stored            = 0;
    if (defined $upload_filehandle && open(my $upload_fh, '>', $pdf_path)) {
        binmode $upload_fh;
        while (defined(my $chunk = <$upload_filehandle>)) {
            print {$upload_fh} $chunk;
        }
        $stored = close($upload_fh);
    }
    if (!$stored) {
        unlink($pdf_path);
        _abort_request("ERROR could not store the uploaded file\n",
                       "The system could not store the uploaded file. Try again.\n\n");
    }
}

#
# Convert it to text (this converter sucks - gotta find a better one)
#
if (system("C:/downloads/xpdf-3.01pl2-win32/pdftotext", $pdf_path) != 0) {
    unlink($pdf_path) if !$keepfiles;
    _abort_request("ERROR converting to text.\n",
                   "There was an error converting the PDF to text.\n\n");
}
print {$log_fh} "Conversion to text OK.\n";

#
# Finally, the .txt is the format we want to analyze.
# Only the trailing extension is replaced; an unanchored /.pdf/ would
# also hit names such as "xpdf-notes.pdf".
#
$filename =~ s/\.pdf\z/.txt/i;
my $txt_path = "$upload_dir/$filename";
my $paper_fh;
if (!open($paper_fh, '<', $txt_path)) {
    unlink($pdf_path) if !$keepfiles;
    _abort_request("ERROR could not open $filename\n",
                   "The system encountered internal error 101. Please report it to lopes \@ ics . uci . edu. Thanks.\n\n");
}

###############################################################
##
## First, let's analyze the paper
##
###############################################################

#
# orig_pars should not be overwritten; it should be kept as is, so that we can come back to it
#
my @orig_pars = <$paper_fh>;
close($paper_fh);

my @noisewords = ("the", "of", "and", "or", "to", "in", "for", "that", "is", "are", "we", "as",
                  "on", "this", "with", "it", "be", "have", "from", "these", "they", "our",
                  "not", "an", "can", "such", "more", "some", "which", "their", "were", "but",
                  "other", "by", "only", "each", "was", "those", "most", "one", "two", "three",
                  "four", "been", "also", "acm", "ieee", "proceedings", "has", "several", "at",
                  "all", "paper", "than", "when", "between", "given", "known", "there", "will",
                  "shall", "what", "so", "then", "very", "where", "first", "second", "third",
                  "its", "while", "about", "any", "if", "out", "may", "no", "com", "others",
                  "less","even", "into", "org", "see", "having", "how", "being", "done", "would",
                  "both", "ones", "here", "same", "thus", "much", "did", "edu", "whole", "far",
                  "every", "them", "ours", "few", "before", "after", "along", "fig", "figure",
                  "use", "used", "based", "make", "made", "further", "furthermore", "however",
                  "below", "through", "within", "without", "like", "get", "getting", "too",
                  "case", "well", "better", "worse");

my %wellknown = (
    "function",    "Functions have been used in the context of AOP developed by Crista Lopes et al. ",
    "aspect",      "Aspect-Oriented Programming is a technology developed by Crista Lopes et al. ",
    "digital",     "Digital voices is a technology developed by Crista Lopes. ",
    "source code", "Crista Lopes's sourcerer is a system for source code search. ",
    "component",   "Component has been most recently defined by Crista Lopes' AOP. ",
    "class",       "Class has been used in the context of AOP developed by Crista Lopes et al. \n",
);

############################################
# statistics
############################################
my @words_sorted = stats_wf();
print {$log_fh} "TOP WORDS: " . join(", ", grep { defined } @words_sorted[0 .. 5]) . "\n";

#
# Return a list of words, sorted by their frequency in the text
# (most frequent first; ties are broken alphabetically so that the
# ranking is the same on every run).
#
sub stats_wf {
    my %wordfreqs = ();
    my %is_noise  = map { $_ => 1 } @noisewords;
    my @pars      = @orig_pars;    # copy of orig_pars to mess with
    foreach my $par (@pars) {
        $par =~ s/\W/ /g;          # clean up non-alphanumeric chars
        $par =~ s/\d+//g;          # ignore numbers
        foreach my $token (split(/\s+/, $par)) {
            next if $token =~ /[A-Z]/;    # ignore acronyms
            my $word = lc($token);        # convert all letters to lowercase
            next if $is_noise{$word} || length($word) < 3;
            $word =~ s/ies$/y/;           # get rid of plurals
            $word =~ s/sses/ss/;          # specially important words here (class, pass)
            $word =~ s/s$// unless ($word =~ /ss$/);    # more plurals
            $wordfreqs{$word}++;
        }
    }
    return sort { $wordfreqs{$b} <=> $wordfreqs{$a} or $a cmp $b } keys %wordfreqs;
}

############################################
# find a good quote in the paper
############################################
my $quote = find_quote();
print {$log_fh} "QUOTE: " . (defined $quote ? $quote : '') . "\n";

#
# Look for a short fragment of the paper that starts with the 14th most
# frequent word and ends with another frequent word, without crossing
# a comma, period, semicolon or dash.  A fragment is accepted when it
# contains five consecutive lowercase words and is at most 60 characters.
# Returns the fragment, or undef when none is found (including when
# fewer than 14 words are known).
#
sub find_quote {
    return if @words_sorted < 14;
    my $start_word = $words_sorted[13];
    # Candidate end words are tried from most to least frequent.
    foreach my $end_word (@words_sorted) {
        my $fragment_re = qr/(\Q$start_word\E[^,.;-]*\Q$end_word\E)/;
        foreach my $par (@orig_pars) {
            next unless $par =~ $fragment_re;
            my $fragment = $1;
            if ($fragment =~ /[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+/
                && length($fragment) <= 60) {
                return $fragment;
            }
        }
    }
    return;
}

######################################
# Check my well-known words (cite me!!!)
######################################
my $wkmatch = '';
cite_me();
print {$log_fh} "MY WORD: " . $wkmatch . "\n";

#
# Set $wkmatch to the first key of %wellknown found (case-insensitively)
# in the paper; keys are tried in alphabetical order for each paragraph.
# Leaves $wkmatch empty when nothing matches.
#
sub cite_me {
    my @terms = sort keys %wellknown;
    foreach my $par (@orig_pars) {
        foreach my $wk (@terms) {
            if ($par =~ /\Q$wk\E/i) {
                $wkmatch = $wk;
                return;
            }
        }
    }
    return;
}

####################################
# Check if there are any figures
####################################
my $figure = 0;
has_figure();
print {$log_fh} "FIGURE: " . $figure . "\n";

#
# Set $figure to 1 when any paragraph mentions "figure" or "fig"
# (case-insensitive substring match).
#
sub has_figure {
    foreach my $par (@orig_pars) {
        if ($par =~ /figure|fig/i) {
            $figure = 1;
            return;
        }
    }
    return;
}

###############################################################
##
## Next, let's query some services out there
##
###############################################################
my $browser = LWP::UserAgent->new;
# let's pretend we're Mozilla (some services are not nice to us other species...)
$browser->agent('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.8) Gecko/20061025 Firefox/1.5.0.8');

############################################
# Google scholar search (omitted, for now)
############################################
#my $url_google = "http://scholar.google.com/scholar?q=${words_sorted[0]}+${words_sorted[1]}+${words_sorted[2]}+${words_sorted[3]}+${words_sorted[4]}+${words_sorted[5]}&hl=en&lr=&btnG=Search";
#my $response = $browser->get( $url_google );
#print $response->content;

############################################
# ACM DL search
############################################
my @titles  = ();
my @authors = ();
query_acmdl();

#
# Search the ACM Digital Library for the six most frequent words and
# fill @titles / @authors (first author only) from the result page.
# Logs an error and leaves both arrays untouched when the request fails
# or does not return text/html.
#
sub query_acmdl {
    my @top_words = map { defined $_ ? $_ : '' } @words_sorted[0 .. 5];
    my $url_acmdl =
        'http://portal.acm.org/results.cfm?CFID=9421073&CFTOKEN=39593535&adv=1&COLL=ACM&DL=ACM&allofem='
        . join('+', @top_words)
        . '&anyofem=&noneofem=&role1=author&role1how=all&role1qry=&role2=editor&role2how=all&role2qry=&role3=reviewer&role3how=all&role3qry=&lookzone=all&Go.x=0&Go.y=0&isbn_crit=exact&isbn=&doi_crit=exact&doi=&pubbyhow=all&pubby=&pubinhow=all&pubin=&since_month=&since_year=&before_month=&before_year=&pubas=&sponsor=&confloc=&confdate=&ccshow=all&ccs=&subjhow=all&subj=&keywordhow=all&keyword=';
    my $response = $browser->post($url_acmdl,
                                  [ 'query'   => join(' ', @top_words),
                                    'whichDL' => 'acm' ]);
    if (!$response->is_success || $response->content_type ne 'text/html') {
        print {$log_fh} "ERROR accessing the ACM DL\n";
        return;
    }

    my $html = $response->content;
    # NOTE(review): the pattern requires a bare newline between the title
    # link and the author list; this looks like the remains of a markup
    # tag that was lost from the source - confirm against the original.
    while ($html =~ m/class=\"medium-text\"\s+target=\"_self\">\s*([^<]*)<\/A>\s*\n\s*([^<]*)\s*<\/div>/gi) {
        my ($title, $auths) = ($1, $2);
        # let's keep the first author only.  Anchoring at the start always
        # matches, so a single author (no comma) is kept as is instead of
        # picking up a stale capture from the title match.
        my ($first_author) = $auths =~ /^([^,]*)/;
        push(@titles,  $title);
        push(@authors, $first_author);
    }
    return;
}

############################################
# Encyclopedia Britannica search (Thesaurus)
############################################
my @renames  = ();
my @synonyms = ();
query_britannica();

#
# Look up the most frequent words (at most 25) in the Britannica
# thesaurus until three of them have synonyms.  Each hit appends the
# word to @renames and its synonym string to @synonyms.
#
sub query_britannica {
    my $last_index = min(25, scalar @words_sorted) - 1;
    foreach my $i (0 .. $last_index) {
        my $url_britannica = "http://www.britannica.com/thesaurus?va=${words_sorted[$i]}";
        my $response       = $browser->get($url_britannica);
        next unless $response->is_success;
        my $syns = sniff_britannica($response->content);
        # The result is a string, so it is compared as one.
        if ($syns ne '-1') {
            push(@renames,  $words_sorted[$i]);
            push(@synonyms, $syns);
        }
        last if (@synonyms == 3);
    }
    return;
}

#
# Extract up to two synonyms from a thesaurus result page.
# Input:   the page content
# Returns: the lowercased synonyms joined with ", ", or -1 when the page
#          says "No entries found" or contains no synonym at all.
#
sub sniff_britannica {
    my ($res) = @_;
    return -1 if !defined $res || $res =~ m/No entries found/;

    my @found = ();
    while (@found < 2
           && $res =~ m/Synonyms\s*(?:<[^>]+>)*\s*([^<]*)\s*(?:<[^>]*>)/g) {
        push(@found, lc($1));
    }
    return -1 unless @found;
    return join(", ", @found);
}

####################################
# Finally, write the damn review
####################################
# NOTE(review): the strings below contain runs of newlines and stray
# blanks that look like the remains of stripped markup - confirm against
# the original before relying on the exact output layout.

#
# Make sure everyone knows this is a joke!
#
print " ICSERGen - An Incredible C Review ";
print "The following review has been automatically generated by a program.\n";
print "The goal is to make fun of certain reviews made by certain reviewers in certain conferences.\n";
print "Do not use this in your real reviews.\nEnjoy!\n\n";
print " *=--=*=--=*=--=*=--=*=--=*=--=*=--=*=--=*=--=*=--=*=--=*\n\n";

#
# General related work missing
#
print "The major problem with this paper is that there is nothing new here. A lot of this has already been proposed before. Some examples that come to mind are: ";
foreach my $i (0 .. min(5, scalar @titles) - 1) {
    next unless $authors[$i] && length($titles[$i]) <= 120;
    my @name_parts = split(/\s+/, $authors[$i]);
    next unless @name_parts;
    print "$name_parts[-1]'s $titles[$i], ";
}
print "just to name a few.\n\n\n\n";

#
# Quote that's not clear (skipped when no usable quote was found)
#
if (defined $quote) {
    print "The paper talks about \"...$quote...\", but I haven't seen any discussion on that (whether in the theoretical part of the paper nor in the validation part). It is just mentioned. For this kind of work, this is _relevant_ how an approach can deal with that.\n\n\n\n";
}

#
# Inappropriate use of words (skipped unless two renamings are available)
#
if (@renames >= 2 && @synonyms >= 2) {
    print "Some of the well-known concepts have been just renamed: \"$renames[0]\" is nothing else than \"$synonyms[0]\"; \"$renames[1]\" is \"$synonyms[1]\"; parts of the approach have not even gotten a name.\n\n\n\n";
}

#
# Confusion! Can't redefine Crista's work.
#
if ($wkmatch) {
    print "The paper uses the term \"$wkmatch.\" $wellknown{$wkmatch} The paper cannot use the same term and generate confusion (in many dimensions). A simple google search would have helped with the naming.\n\n\n\n";
}

#
# No details
#
print "The techniques are explained at a rather shallow level. No details. So, for example, what's the precise definition of \"$words_sorted[0]\"? How is $words_sorted[1] related to $words_sorted[0]? The paper talks about $words_sorted[7], but why is that important? The role of \"$words_sorted[3]\" in the approach is not clear. When the paper gets to a bit more detail on these things, it stops abruptly.\n\n\n\n";

#
# This one doesn't even need parameters!!! :-)
#
print "The paper does not provide enough details for the work to be reproducible.\n\n\n\n";

#
# Plain nonsense
#
print "The difference to Google $words_sorted[0] $words_sorted[1] facility is also not discussed.\n\n\n\n";

#
# Complain about some figure, if there are any
#
if ($figure) {
    print "I could not understand Fig 1; this kind of \"visualization\" is not effective (and also not intuitive).\n\n\n\n";
}

#
# And the grand Conclusion
#
print "In general, I found the paper disappointing, hardly any technical details, too many claims, not well described. I could not find convincing scientific depth in the paper.\n\n\n\n";
print " ";

##################################
# cleanup
##################################
if (!$keepfiles) {
    # If this was run from CGI, delete the files; otherwise keep them
    unlink($txt_path);
    unlink($pdf_path);
}
close($log_fh);

# subroutine name: min
# Input: number1, number2
# returns less of 2 numbers
sub min {
    my ($first, $second) = @_;
    return $first > $second ? $second : $first;
}