#!/usr/bin/perl -w
#,----
#| rename_papers.pl --- Rename papers based on their titles
#| Created Mon Mar 14 2005 by Tom Fawcett
#| Copyright (2005) by Tom Fawcett
#`----
#
# USAGE:
#   perl rename_papers.pl file1 file2 ...
#
# COMMENTARY:
#   I often download papers from the net in PDF or PS format.
#   They often have cryptic uninformative filenames like
#   steig12dist.pdf or KDD2003.ps.gz.  I decided I wanted to
#   rename them with their titles.  This script does that
#   automatically.  It scans each file, extracts the text, and
#   tries to find a title at the beginning.  It then presents you
#   with what it thinks is the title and lets you edit it, then
#   renames the file.  If you delete the text completely, the
#   rename is cancelled.
#
#   This script needs the following things:
#   - the perl module Term::ReadLine.
#
#   - pdftotext, to handle .pdf files.  It's part of the xpdf
#     package:  http://www.foolabs.com/xpdf/download.html
#
#   - pstotext, to handle .ps files:
#        http://research.compaq.com/SRC/virtualpaper/pstotext.html
#     or ps2ascii, from:
#        http://www.cs.wisc.edu/~ghost/
#
#   You also probably need a working ghostscript interpreter and
#   various of the GNU utilities (gzip/gunzip), plus a "trash"
#   command if you want to use the DEL option.
###############################################################################
use strict;
use warnings;
use English qw(-no_match_vars);
use File::Temp qw(:POSIX tempfile);
use Term::ReadLine;

# External converters.  Both are invoked as "<tool> [options] input output".
use constant PDF_TO_ASCII => "pdftotext";
use constant PS_TO_ASCII  => "ps2ascii";

# We scan this far into the text to try to find a title
use constant TOP_LINES_TO_SCAN => 20;

my $DEBUG = 1;

# These characters will be converted to underscores so they don't
# confuse the filesystem.  (The string is interpolated into a regex
# character class, hence the doubled backslashes.)
my $ILLEGAL_CHARACTERS = "/\\\\ ";

# Set up by main() and shared with the helpers below.
my $tempfile;    # Scratch file that receives the extracted text
my $term;        # Term::ReadLine object used for the rename prompt

# Run only when executed as a script, so the subs can also be loaded
# (e.g. with require) without side effects.
main(@ARGV) unless caller;

# Process every file named on the command line.
# Returns 0 (the script's exit status has always been 0).
sub main {
    my @files = @_;

    unless (@files) {
        warn "USAGE: perl rename_papers.pl file1 file2 ...\n";
        return 0;
    }

    # tempfile() creates the file atomically (tmpnam() only picks a name,
    # which is open to a race); the converters simply overwrite it.
    my $fh;
    ( $fh, $tempfile ) = tempfile( UNLINK => 1 );
    close($fh) or warn "close($tempfile): $!\n";
    print "tempfile = $tempfile\n" if $DEBUG;

    $term = Term::ReadLine->new('rename_papers');

    for my $file (@files) {
        process_file($file);
    }

    unlink($tempfile);    # remove tempfile
    return 0;
}

# Return the text after the last "." in the final path component,
# or undef if there is none.
sub file_extension {
    my ($path) = @_;
    my ($ext) = $path =~ /\.([^.\/]+)$/;
    return $ext;
}

# Handle one command-line argument: gunzip it if necessary, offer the
# rename, and gzip the file again under whatever name it ends up with.
sub process_file {
    my ($original) = @_;

    unless ( -e $original ) {
        warn "file $original doesn't exist -- skipping\n";
        return;
    }

    my $ext = file_extension($original);
    if ( !defined($ext) ) {
        warn "file $original has no extension -- can't tell what it is\n";
        return;
    }

    # See if the file was gzipped and prepare to process the uncompressed
    # file if so.
    my $was_gzipped = 0;
    if ( $ext eq "gz" ) {
        if ( system( "gunzip", $original ) ) {
            warn "Error gunzipping $original, skipping it\n";
            return;
        }
        $was_gzipped = 1;

        # Have to recompute original filename and extension without gz
        $original =~ s/\.gz$//;
        $ext = file_extension($original);
    }

    # $current is where the file lives now: the new name, the old name if
    # nothing was renamed, or undef if the file was moved to the trash.
    my $current = rename_by_title( $original, $ext );

    # Re-zip the file under its *current* name, whichever path we took.
    if ( $was_gzipped and defined($current) ) {
        if ( system( "gzip", "-9", $current ) ) {
            warn "Error gzipping \"$current\"\n";
        }
    }
    return;
}

# Extract the text of $original (whose extension is $ext, possibly undef),
# guess a title and interactively rename the file.
#
# Returns the path the file has afterwards, or undef if it was trashed.
#
# NOTE: the suggested name has no directory part, so a file given as
# dir/x.pdf is renamed into the current directory unless the user types
# a path at the prompt.
sub rename_by_title {
    my ( $original, $ext ) = @_;

    if ( !defined($ext) ) {
        warn "file $original has no extension -- can't tell what it is\n";
        return $original;
    }

    # List-form commands bypass the shell, so odd characters in file
    # names cannot be misinterpreted.
    my @cmd;
    if ( $ext eq "pdf" ) {
        @cmd = ( PDF_TO_ASCII, "-f", 1, "-l", 2, $original, $tempfile );
    }
    elsif ( $ext eq "ps" ) {

        # pstotext would also work, but it writes to stdout and so would
        # need its output redirected into $tempfile.
        @cmd = ( PS_TO_ASCII, $original, $tempfile );
    }
    else {
        print "Don't know what to do with extension .$ext, skipping file\n";
        return $original;
    }

    if ( system(@cmd) ) {
        warn("Error running @cmd\nSkipping\n");
        return $original;
    }

    # Basic text is now in tempfile
    my $title = get_title($tempfile);
    if ( !defined($title) ) {
        print "Couldn't get a title from $original\n";
        return $original;
    }

    my $new = "$title.$ext";
    $new =~ s/[$ILLEGAL_CHARACTERS]/_/g;

    # Already named after its title (raw or sanitised): nothing to do.
    return $original
      if $original eq "$title.$ext"
      or $original eq $new;

  RENAME:
    while (1) {
        print "Delete entire line to cancel rename\n";
        print "Enter DEL to delete the file\n";
        $new = $term->readline( "New name for $original = ", $new );

        if ( !defined($new) or $new eq "" ) {
            print "Rename cancelled\n";
            return $original;
        }

        # Check DEL before the existence test so that a stray file
        # called "DEL" cannot block the delete command.
        if ( $new eq "DEL" ) {
            print "Moving $original to trash\n";
            if ( system( "trash", $original ) ) {
                warn "Error moving $original to trash\n";
                return $original;
            }
            return;
        }

        if ( -e $new ) {
            print "File $new already exists, won't overwrite it\n";
            print "Choose another name\n";
            next RENAME;
        }

        if ( rename( $original, $new ) ) {
            return $new;
        }
        warn "Error renaming $original to $new\n";
        return $original;
    }
}

# Given an ascii file, return the first line that might be a title.
# Returns undef if nothing could be found within the first
# TOP_LINES_TO_SCAN lines.  Dies if the file cannot be opened.
# This is a hack, of course, but it works for many things.
sub get_title {
    my ($file) = @_;

    # Often publication info precedes the title -- lines matching any of
    # these are skipped.
    my $publication_info = qr/(JOURNAL|PROCEEDINGS|SUBMITTED|PUBLISH|19\d\d|20\d\d|MANUSCRIPT|EDITOR|IN PRESS|AVAILABLE ONLINE|COPYRIGHT)/i;

    open( my $fh, '<', $file ) or die("open($file): $!");

    print "Header text from file:\n" if $DEBUG;

    my $title;
    while ( my $line = <$fh> ) {
        chomp $line;
        last if $INPUT_LINE_NUMBER > TOP_LINES_TO_SCAN;

        print "Seeing: $line\n" if $DEBUG;

        if ( $line =~ $publication_info ) {
            print "Skipping because of $1\n" if $DEBUG;
            next;
        }

        $line =~ s/^\s+//;    # Remove leading whitespace
        $line =~ s/\s+$//;    # and trailing

        next if $line eq "";              # Skip blank lines.
        next unless $line =~ /[a-zA-Z]/;  # Must have alphabetical text

        my @words = split ' ', $line;
        next unless @words > 1;           # Must be at least two words

        $title = $line;                   # Looks like we have a title
        last;
    }
    close($fh);

    return $title;
}

1;

##### End of rename_papers.pl