#!/usr/bin/perl

my $VERSION = 0.2;

package Fileinfo;
use strict;
use warnings;
use File::Basename ();
use File::Spec;
use File::Listing;

*pathjoin = sub{ File::Spec->join(@_) };

# constructor.
# new Fileinfo( $local_path )
# new Fileinfo( $remote_path, $ls_entry, $ftp)
sub new {
    my $f = bless +{}, (ref($_[0])? ref(shift) : shift);
    my($path,$ls_entry, $ftp)= @_;

    $f->path( $path );
    $f->ftp($ftp);
    $f->size($f->mtime($f->mode($f->exists(0))));
    $f->type('?');
    $f->alphadog(1);

    if (defined $ls_entry){

        # It's an `ls' entry returned from parse_dir as
        # [ $name, $type, $size, $mtime, $mode ]
        $f->type($ls_entry->[1] || '?');
        $f->size($ls_entry->[2]);
        $f->mtime($ls_entry->[3]);
        $f->mode($ls_entry->[4]);
        $f->exists(1);

    }
    else{
        # It's a local path.
        my @s = lstat $f->path;
        if (@s){
            $f->type ( -f _ ? 'f':
                       -d _ ? 'd':
                       -l _ ? 'l':
                              '?');
            $f->size($s[7]);
            $f->mtime($s[9]);
            $f->mode($s[2]);
            $f->exists(1);
        }
    }
    $f->dirname(File::Basename::dirname($f->path));
    $f->basename(File::Basename::basename($f->path));
    $f->{'ls'} = []; # Don't call method here.
    return $f;
}

# create accessors
for my $attr (qw( path dirname basename size mtime mode
                  type exists has_ls ftp alphadog))
{
    no strict 'refs';
    *{ __PACKAGE__ . "::" . $attr} = sub { $_[0]->{$attr} = $_[1] if @_ > 1;
                                           $_[0]->{$attr} };
}

sub is_dir  { shift->type eq 'd' }
sub is_link { shift->type eq 'l' }
sub is_file { shift->type eq 'f' }
sub to_string {
    my $path = shift->path;
    defined($path) ? $path : 'undef'
};

sub ls {
    my $f = shift;
    return $f->{'ls'} unless $f->is_dir;

    if (defined($f->path) && $f->is_dir &&  not $f->has_ls){
        $f->{'ls'} = [];

        if (defined $f->ftp){
            # remote

            for my $ls_entry (parse_dir($f->ftp->dir($f->path))){
                next if $ls_entry->[0] =~ /^\.\.?$/;
                push @{$f->{'ls'}}, new Fileinfo( pathjoin($f->path, $ls_entry->[0]),
                                                  $ls_entry, $f->ftp);
            }
            $f->has_ls(1);
        }
        else{
            # local
            opendir DIR, $f->path or ::fail("Could not opendir ". $f->path);
            while (my $dir_entry = readdir DIR){
                next if $dir_entry =~ /^\.\.?$/;
                push @{$f->{'ls'}}, new Fileinfo( pathjoin($f->path, $dir_entry));
            }
            close DIR;
            $f->has_ls(1);
        }
    }

    return $f->{'ls'};
}

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
package main;
use strict;
use warnings;
use POSIX qw(setuid setgid);
use URI;
use Getopt::Long;
use Pod::Usage;
use Net::FTP;
use File::Path ();
use File::Find;
use File::Basename ();
use File::Spec;
use File::Listing;
use constant {
              Defer_It  => 0,
              Skip_It   => 1,
              Fetch_It  => 2,
              Delete_It => 4,
              };
*pathjoin = sub{ File::Spec->join(@_) };

## OPTIONS
my ($opt_verbose,     $opt_allow_root) = (0,0);
my ($opt_existing,    $opt_delete,  $opt_debug)      = (0,0,0);
my ($opt_passive_ftp, $opt_squish,  $opt_recurse)    = (0,0,0);

my $opt_maxdepth = 2**31;
my ($opt_setuid, $opt_setgid);

my %known_mirrors = ( 'cpan' => \&opt_cpan,
                      'gnu' => \&opt_gnu
                    );

## REGEXPS LISTS
my (@rx_paths,@rx_basenames);
my (@rx_not_paths,@rx_not_basenames);

## PARSE COMMAND LINE
GetOptions(
        'existing' => \$opt_existing,
        'delete'   => \$opt_delete,
        'passive-ftp!' => \$opt_passive_ftp,
        'verbose!'  => \$opt_verbose,
        'debug!'    => \$opt_debug,,
        'version'   => sub { print "mirror-tightwad $VERSION\n"; exit 0 },
        'squish!' => \$opt_squish,
        'allow-root!' => \$opt_allow_root,
        'setuid=s' => \$opt_setuid,
        'setgid=s' => \$opt_setgid,
        'maxdepth=i' => \$opt_maxdepth,
        'recurse!'  => \$opt_recurse,
        'mirror=s'  => sub{ 
                            fail("Unknown mirror '$_[1]'") unless exists $known_mirrors{$_[1]};
                            $known_mirrors{$_[1]}->();
                          },
        'path=s'       =>  sub{  opt_regexp($_[1],\@rx_paths,\@rx_not_paths)            },
        'basename=s'   =>  sub{  opt_regexp($_[1],\@rx_basenames,\@rx_not_basenames)    },
    ) && scalar(@ARGV) == 2 or pod2usage( -exitval => 1, -verbose => 1);

$opt_verbose = 1 if $opt_debug;

my $uri = new URI($ARGV[0]);
$uri->scheme eq 'ftp' or fail("Only ftp:// URIs are supported.");
my $local_root = new Fileinfo($ARGV[1]);

## DEFINE THE RULES
my (@dir_rules,@file_rules);

## Dir rule: existing
push @dir_rules,  sub { $_[1]->exists ? Defer_It : Skip_It } if $opt_existing;

## Dir rule: regexps
push @dir_rules, sub {
        return Skip_It  if @rx_not_paths && scalar grep { $_[0]->path =~ $_ } @rx_not_paths;
        return Fetch_It if @rx_paths     && scalar grep { $_[0]->path =~ $_ } @rx_paths;
        Defer_It;
};

## File rule: existing
#push @file_rules, sub { $_[1]->exists ? Defer_It : Skip_It } if $opt_existing;

## File rule: regexps
push @file_rules, sub {
        return Skip_It if @rx_not_basenames &&  scalar grep{$_[0]->basename =~ $_} @rx_not_basenames;
        return Skip_It if @rx_not_paths &&  scalar grep {$_[0]->path =~ $_} @rx_not_paths;

        return Fetch_It if @rx_basenames && scalar grep { $_[0]->basename =~ $_ } @rx_basenames;
        return Skip_It  if @rx_basenames;
        return Fetch_It if @rx_paths &&  scalar grep { $_[0]->path =~ $_} @rx_paths;
        return Skip_It  if @rx_paths;
        Defer_It;
};


## Running as root must be explictly allowed
if ( $> == 0 ) { ## we're running as root.
   if (not $opt_allow_root){
       if (not defined($opt_setuid) or not defined($opt_setgid)){
           fail("Refusing to run as root. Use --allow-root or --setuid=USER --setgid=GROUP");
       }
       my $uid =  $opt_setuid =~ /^\d+$/ ? $opt_setuid : getpwnam $opt_setuid;
       my $gid =  $opt_setgid =~ /^\d+$/ ? $opt_setgid : getgrnam $opt_setgid;
       verbose("Setting gid to $gid\n");
       POSIX::setgid($gid);
       verbose("Setting uid to $uid\n");
       POSIX::setuid($uid);
   }
   else{
       verbose("running as superuser");
   }
}


debug("uri: "       . $uri->as_string );
debug("uri_path: "  . $uri->path      );
debug("local_root: " . $local_root->path);

## CONNECT
verbose("Opening " . ($opt_passive_ftp ? 'passive ' : ' ')
       . $uri->scheme . " connection to "
       . $uri->host   . " port "
       . $uri->port);
my $ftp = Net::FTP->new($uri->host, Passive => $opt_passive_ftp) or fail($@);

## LOGIN
verbose("Login");
$ftp->login or fail("Could not login");

## CHDIR
verbose("Cwd " . $uri->path);
$ftp->cwd($uri->path) or fail("Could not cwd to remote path: " . $uri->path);

## Pretend that we got a dir list entry for the remote root
## -- because dir("..") will not work at the top level.
my $remote_pwd = $ftp->pwd;
my $remote_root = new Fileinfo($remote_pwd, [ $remote_pwd , 'd', 1, time(), 0555 ], $ftp);

ftp_dir_walk($ftp, $remote_root, $local_root, 0);

verbose("exiting");
exit 0;


######################################################################
######################################################################

# FTP recursive descent.
sub ftp_dir_walk {
    my ($ftp, $remote_dir, $local_dir, $depth) = @_;

    #debug("ftp_dir_walk( " . $remote_dir->path .", ". $local_dir->path . ")\n");

    verbose("Reading directory  " . $remote_dir->path );
    debug("Local directory is " . $local_dir->path );
    my $entries = $remote_dir->ls;
    build_squishes($entries);

    my %touch; # local_basename => [remote,local]

    ## Call file/link rules
    foreach my $new_remote_file ( grep { $_->is_file || $_->is_link } @$entries) {
        my $new_local_file = new Fileinfo(pathjoin($local_dir->path, $new_remote_file->basename));

        $touch{$new_local_file->basename} = [$new_remote_file, $new_local_file]  if
            should_fetch_file($new_remote_file, $new_local_file);

    }

    ## Delete files not in the touch hash.
    if ($opt_delete ){
        for (grep{$_->is_file||$_->is_link } @{$local_dir->ls}){
            if (not exists $touch{$_->basename}){
                verbose("unlinking " . $_->path );
                unlink $_->path;
            }
        }
    }

    ## Touch the files we want rsync to update
    for my $a (values %touch){
        touch_file(@$a);
    }

    ## Maxdepth cutoff
    return if $depth >= $opt_maxdepth;

    ## Recurse
    return unless $opt_recurse;

    for my $new_remote_dir ( grep {$_->is_dir}  @$entries ){
        my $new_local_dir = new Fileinfo(pathjoin($local_dir->path, $new_remote_dir->basename));

        ## Call dir rules
        my $action = apply_rules($new_remote_dir,$new_local_dir,\@dir_rules, Fetch_It);
        next if $action == Skip_It;

        ## Recurse
        ftp_dir_walk ($ftp, $new_remote_dir, $new_local_dir, $depth+1);
    }
}

## Make the big decision.
sub should_fetch_file {
    my ($R, $L) = @_;

    ## Don't even look at it unless it passes the regexp rules
    return 0 unless apply_rules($R, $L, \@file_rules, Skip_It) == Fetch_It;

    ## Squish unless it is the alpha dog.
    return 0 if $opt_squish && not $R->alphadog;

    ## Apply the 'existing' constraint.
    if ($opt_existing){
        # TODO
    }

    ## Ok
    return 1;
}

sub touch_file {
    my ($remote_file, $local_file) = @_;

    ## Create the local file if it doesn't exist. rsync will do the actual fetch.
    return if $local_file->exists;

    File::Path::mkpath($local_file->dirname, ($opt_verbose? 1:0), 0777);

    verbose("Touching file " . $local_file->path );

    open TOUCH, ">" . $local_file->path or fail("Could not create local file " . $local_file->path);
    print TOUCH $remote_file->size, "\n";
    close TOUCH;

    ## Now set the timestamp to a long, long time ago.
    my $mtime = $remote_file->mtime - (365 * 24 * 60 * 60); # about one year ago
    utime $mtime, $mtime, $local_file->path;
}

sub delete_file {
    my ($remote_file, $local_file) = @_;
}


## Applies the regexp rules and returns what to do.
## Returns undef for default action.
## apply_rules($remote,$local, $rules_list)
sub apply_rules {
    my ($remote_file, $local_file, $rules, $default) = @_;

    RULE: foreach my $r (@$rules){
        SWITCH: for ($r->($remote_file, $local_file)){
            next unless defined($_) &&  /^\d+$/;
            if($_ == Skip_It) { return $_ }
            if($_ == Fetch_It){ return $_ }
        }
    }
    return $default;
}

## Find the most recent packages in a list of files
sub build_squishes {
    my ($ls) = @_;

    return unless $opt_squish;
    my %squishes; # package => [ Fileinfo ... ]

    FILE: for my $f (grep { $_->is_file } @$ls){
        for my $rx (@rx_basenames){
            next unless $f->basename =~ $rx; # package must be in $1
            my $package = $1;
            push @{$squishes{$package}}, $f;
            next FILE;
        }

    }

    # Sort the squishes by mtime
    for my $pkg (keys %squishes){
        $squishes{$pkg} = [ sort { $b->mtime <=> $a->mtime } @{$squishes{$pkg}} ];
    }

    ## Mark the alphadog and unmark the rest
    for my $a ( values %squishes){
        for my $f (@$a){
            $f->alphadog(0);
        }
        $a->[0]->alphadog(1);
    }
}

## Callback for --FOO=regexp options.
sub opt_regexp {
    my ($expr, $list, $notlist) = @_;
    if ( $expr =~ /^\!/){
        $list = $notlist;
        $expr =~ s/^\!//;
    }

    ## Syntax check the expression
    eval { 'foo' =~ m/$expr/ };
    fail("Invalid regular expression:  $expr\n$@\n") if $@;

    push @$list, qr/$expr/;
}

# callback for option --cpan
sub opt_cpan {
    my $ext =  qr/\btar\.(?:gz|bz2)/;
    my $rev =  qr/\d+[\d\.]*/;
    push @rx_basenames, qr/^ ([\w-]+) -? $rev \. $ext $/x;

    $opt_squish = 1;
    $opt_recurse = 1;
}

# callback for option --gnu
sub opt_gnu {
    my $ext =  qr/\btar\.(?:gz|bz2)/;
    my $rev =  qr/\d+[\d\.]*/;
    push @rx_basenames, qr/^ ([\w-]+) -? $rev \. $ext $/x;

    $opt_squish = 1;
    $opt_recurse = 1;
}

sub verbose { print_safe(\*STDOUT, @_) if $opt_verbose }
sub debug   { print_safe(\*STDOUT, @_) if $opt_debug }

sub print_safe {
     my $fh = shift;
     return unless scalar( my @a =  map { defined($_)?$_:'undef' } @_);
     print $fh (my $s = join ' ', @a);
     print $fh "\n" if $s !~ m/\n$/;
}

sub fail{ print_safe(\*STDERR,@_); exit(1) }

1
__END__

=head1 NAME

mirror-tightwad - Create a subset of a mirror

=head1 SYNOPSIS

  mirror-tightwad  [OPTIONS] ftp://remote.host/path/  /local/path/


=head1 DESCRIPTION

mirror-tightwad is a utility to mirror only a subset of a large site.  The goal
is to save hard disk space by only mirroring what you want.  mirror-tightwad is
designed to be a pre-processor for rsync. First you run mirror-tightwad, then
you run rsync (or wget, etc.).

The first argument is a URL for the remote server. Currently only ftp is
supported.  The second argument is the local mirror directory.

By default, mirror-tightwad assumes you are a total tightwad, and will
not mirror anything at all. You have to explicitly ask for recursion,
and provide regular expressions to match against the files you want to grab.
See L</PATTERNS>.

=head1 PREREQUISITES

This script requires the following modules:
C<File::Basename>,
C<File::Find>,
C<File::Listing>,
C<File::Path>,
C<File::Spec>,
C<Getopt::Long>,
C<Net::FTP>,
C<POSIX>,
C<Pod::Usage>,
C<URI>,
C<constant>,
C<strict>,
C<warnings>


=head1 SQUISHING

Let's say the remote directory /pub/gnu/autoconf contains,

    autoconf-2.55.tar.bz2
    autoconf-2.55.tar.gz
    autoconf-2.56.tar.bz2
    autoconf-2.56.tar.gz
    autoconf-2.57.tar.bz2  <--- The most recent.
    autoconf-2.57.tar.gz

but you just want to mirror whichever one is the most recent. You can do this
with option B<--squish>, which tells mirror-tightwad that all the C<autoconf-*>
files in a directory belong to the same logical package, and to grab only the
most recent file in that package.  The files are squished according to the
pattern defined by B<--basename>, which is matched against the basename of the
file.  The first set of capturing parenthesis is the package name. The above
filenames could be squished like this:

    mirror-tightwad --basename='^(\w+)-[\d.]+\.tar\.(?:gz|bz2)$' ...

Since the first set of capturing parenthesis is C<(\w+)>, this pattern would
squish the above filenames into the package ``autoconf''. Then, the file with
most recent modification time will be chosen, and the rest of the files in the
autoconf package will be ignored.


=head1 PATTERNS

All patterns are Perl 5 regular expressions, with one extra helping of sugar:
if the first character is '!', then the matching will be negated. Perl 5
regular expression syntax is not documented here. See L<perlrequick(1)>, L<perlretut(1)>, and
L<perlre(1)>.

All negated patterns are attempted first, then normal patterns. If any negated
pattern succeeds, then no further matches are attempted.

Remember to use C<^> and C<$> (or C<\A> and C<\z>) where needed.

=head1 OPTIONS

=over

=item B<--allow-root>

Allows mirror-tightwad to run as superuser. If this option is not given, then
mirror-tightwad refuses to run as superuser... but see B<--setuid>.

=item B<--basename=REGEXP>

Considers only the remote file whose basename matches I<REGEXP>.  Multiple
B<--basename> options may be  specified.
Matching stops on the first successful match.
If REGEXP  starts with '!', then the match is negated.
All negated patterns are attempted before any normal matches.
Don't forget to use C<^> and C<$> (or C<\A> and C<\z>) where needed.


=item B<--debug>

Prints a lot of information useful for debugging.

=item B<--delete>

Deletes local files that are not scheduled to be mirrored. This is useful
for deleting older versions of tarballs as new versions arrive.

=item B<--existing>

Only consider files or directories that already exist locally.

=item B<--maxdepth=NUM>

Set the maximum recursion depth to NUM. The default is very large.

=item B<--mirror=NAME>

Shortcut for setting options for well-known mirrors. NAME can be one of
I<CPAN>, or I<GNU>. Feel free to override the settings by providing 
additional options. For example, C<--mirror=GNU> is similar to specifying,

  --recurse --squish --basename='^([\w-]+)-\d+[\d\.]*\.(tar\.(?:gz|bz2))$'


=item B<--passive-ftp>

Enables passive FTP mode. Useful if you are behind a firewall.

=item B<--path=REGEXP>

Considers only the remote files whose full path matches I<REGEXP>.  Multiple
B<--basename> options may be specified.  Matching stops on the first successful
attempt.  If REGEXP  starts with '!', then the match is negated.  All negated
patterns are attempted before any normal matches.  Don't forget to use C<^> and
C<$> (or C<\A> and C<\z>) where needed.

=item B<--recurse>

Enables directory recursion.

=item B<--setuid=USER>

Sets the user id before proceeding. I<USER> can be a name or numeric user id.

=item B<--setgid=GROUP>

Sets the group id before proceeding. I<GROUP> can be a name or numeric group id.

=item B<--squish>

Enable squishing defined by the first capturing parenthesis in  B<--basename>.
See L</SQUISHING>.

=item B<--verbose>

Prints details about what mirror-tightwad is doing.

=item B<--version>

Prints version information and exit.

=back

=head1 EXAMPLES

=over

=item 1. Get the latest GNU tarballs (saves about ???MB)

    mirror-tightwad --gnu --delete ftp://ftp.gnu.org/pub/gnu/ /mirrors/gnu/
    rsync -av --existing --delete ftp.gnu.org::pub/gnu/ /mirrors/gnu/

=item 2. Same as above, but don't get the manuals.

    mirror-tightwad --gnu --delete --path='!/Manuals/' ftp://ftp.gnu.org/pub/gnu/ /mirrors/gnu/
    rsync -av --existing --delete ftp.gnu.org::pub/gnu/ /mirrors/gnu/

=item 3. This time, only scan the the first level of directories.

    mirror-tightwad --gnu --delete --maxdepth=1 --path='!/Manuals/' ftp://ftp.gnu.org/pub/gnu/ /mirrors/gnu/
    rsync -av --existing --delete ftp.gnu.org::pub/gnu/ /mirrors/gnu/

=item 4. Only mirror what you already have

    mirror-tightwad --existing --basename='.+' ftp://host/stuff/ ~/stuff/
    rsync -av --existing --delete ftp://host/stuff ~/stuff

=back


=head1 TROUBLESHOOTING

Did you forget B<--recurse>? Did you specify a B<--basename> pattern?

Behind a firewall? Then use B<--passive-ftp>.


=head1 BUGS

Option --existing doesn't squish.
Option --quota is not yet implemented.

=pod SCRIPT CATEGORIES

Networking
UNIX/System_administration

=head1 AUTHOR

John Millaway <millaway@acm.org>

=cut

vim:set ft=perl ai si et ts=4 sts=4 sw=4 tw=0: