#===========================================================================
# 

package Sitescooper::PerSiteDirCache;

use Carp;

BEGIN { @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File); }
use AnyDBM_File;
use Fcntl;

use Sitescooper::Main;
use Sitescooper::PerSiteCache;
use Sitescooper::CacheObject;
use Sitescooper::CacheSingleton;

use strict;

use vars qw{
  @ISA $SLASH
};

@ISA = qw(Sitescooper::PerSiteCache);

sub new {
  my $class = shift; $class = ref($class) || $class;

  my ($main, $factory, $robot, $sitename) = @_;
  my $self = $class->SUPER::new($main, $factory, $robot, $sitename);
  bless ($self, $class);

  $SLASH				= $Sitescooper::Main::SLASH;
  $self->{refresh}			= $self->{main}->{cf}->{refresh};
  $self->{last_modtime}			= { };
  $self->{oldest_modtime_at_host}	= { };
  $self->{ptrs_to_commit}		= { };

  $self->{singleton} = new Sitescooper::CacheSingleton ($main, $factory);

  $self->{sitename} =~ s/[^-_.A-Za-z0-9]/_/g;

  $self->{cachedir} =
  		$self->{factory}->{cachedir}.$SLASH.$self->{sitename};
  if (!-d $self->{cachedir}) {
    mkdir ($self->{cachedir}, 0777) or
               die "failed to mkdir '$self->{cachedir}'\n";
  }

  $self->{newcachedir} =
  		$self->{factory}->{newcachedir}.$SLASH.$self->{sitename};
  if (!-d $self->{newcachedir}) {
    mkdir ($self->{newcachedir}, 0777) or
	       die "failed to mkdir '$self->{newcachedir}'\n";
  }

  # Open the modification-time database.
  $self->{mod_file} = $self->{cachedir}.$SLASH."modtimes.db";

  # TODO: implement nowrite functionality for db's
  # $self->{new_mod_file} = $self->{newcachedir}.$SLASH."modtimes.db";

  # use AnyDBM_File, but use the more efficient DB_File where supported
  my $dbtype = 'AnyDBM_File';
  if ($^O !~ /win|mac|os2/i) { $dbtype = 'DB_File'; }

  # tie the db. Use O_CREAT|O_RDWR so that we *may* be able to share
  # the db between two processes running at once; we do this by using
  # it read-only during scooping, and committing the changes at the
  # end. It might just work, so it's worth a try ;)
  my %db;
  my $attempt = 0;

retry_tie:
  $attempt++; 
  if (!tie (%db, $dbtype, $self->{mod_file}, O_CREAT|O_RDWR, 0644))
  {
    warn "Cannot open/create database: $self->{mod_file}: $@ $!\n";
    if ($! =~ /file exists/i && $attempt == 1)
    {
      # STUPID redhat 7.1 upgrade error; it barfs if DB_File now uses a
      # different libdb version in the background. delete the database
      # and restart it. STUPID STUPID Red Hat. GRRR.
      # 
      unlink ($self->{mod_file}) or die "cannot unlink $self->{mod_file}: $@ $!\n";;

      warn "Warning: database file support has been broken, probably due to Red Hat\n".
	   "\tstupidity.  Cache data has been deleted, to work around this.\n";

      goto retry_tie;
    }

    die "Cannot recover from tie error: $self->{mod_file}: $@ $!\n";
  }

  $self->{mod_db} = \%db;

  $self;
}

sub commit {
  my ($self) = @_;

  my ($from, $to);

  if (!$self->{main}->{cf}->{nowrite}) {
    while (($from,$to) = each %{$self->{oldest_modtime_at_host}}) {
      $self->dbg ("Saving new modtime age cache entry: $from => $to ".
	      "(".$self->{main}->time2datestr($to).")");

      $self->{mod_db}->{'O'.$from} = $to;
    }

    while (($from,$to) = each %{$self->{last_modtime}}) {
      $self->{mod_db}->{'L'.$from} = $to;
    }

    while (($from,$to) = each %{$self->{ptrs_to_commit}}) {
      $self->{mod_db}->{'P'.$from} = $to;
    }
  }

  untie ($self->{mod_db}) or warn "Untie failed: $self->{mod_file}\n";
}

# ---------------------------------------------------------------------------

sub set_last_modtime {
  my ($self, $url, $mod) = @_;

  return unless (defined $url);
  return unless (defined $mod);

  $url = Sitescooper::Util::URLWithoutAnchor ($url);
  $self->{last_modtime}->{$url} = $mod;
  $self->update_oldest_page_at_site ($url, $mod);
}

sub get_last_modtime {
  my ($self, $url) = @_;

  return undef unless (defined $url);

  $url = Sitescooper::Util::URLWithoutAnchor ($url);

  my $mod = $self->{last_modtime}->{$url};
  if (!defined($mod) && !$self->{refresh})
  {
    # $mod = $self->{factory}->{last_modtime}->{$url};
    $mod = $self->{mod_db}->{'L'.$url};
  }

  if (defined $mod) {
    # ensure this entry doesn't get expired from the cache
    $self->update_oldest_page_at_site ($url, $mod);
  }
  return $mod;
}

# ---------------------------------------------------------------------------

sub update_oldest_page_at_site {
  my ($self, $url, $lastmod) = @_;
  return unless defined ($lastmod);
  
  $url =~ m,http://(\S+?)/,i; my $urlhost = $1;
  return unless defined ($urlhost);
  
  #$self->dbg ("checking to see if $url is oldest at its site: modtime=".
        #(defined $self->{last_modtime}->{$url} ? $self->{last_modtime}->{$url} : "unknown)"));

  if (defined $self->{last_modtime}->{$url})
  {
    my $orig = $self->{oldest_modtime_at_host}->{$urlhost};
    if (!defined($orig) || $orig > $self->{last_modtime}->{$url}) {
      $self->dbg ("oldest link seen at $urlhost $url: modtime=".
                $self->{last_modtime}->{$url}." (".
                $self->{main}->time2datestr($self->{last_modtime}->{$url}).")");

      $self->{oldest_modtime_at_host}->{$urlhost} =
                        $self->{last_modtime}->{$url};
    }
  }
}

# ---------------------------------------------------------------------------

sub mk_generic_cache_filename {
  my ($dir, $url, $use_deep_fnames) = @_;

  if (!defined $dir) { return undef; }

  $url = Sitescooper::Util::URLWithoutAnchor ($url);        # trim #anchors
  $url =~ s,^http://,,i; $url =~ s,^www\.,,i;	# trim common stuff
  $url =~ s/[^-_A-Za-z0-9]/_/g;

  # shared caches use 2 levels, to speed up directory accesses --
  # because otherwise you wind up with a huge directory full of entries.
  # at least this moves it towards a tree...
  my $subdir;
  if ($use_deep_fnames && ($url =~ s/(^[-A-Za-z0-9]+)_+//)) {
    $subdir = $1;
  }

  if ($Sitescooper::Main::use_hashes_for_cache_filenames) {
    # try to limit the filename by trimming the start and adding the
    # hash value at the beginning instead. Let's hope this is not
    # too prone to collisions...
    if ($url =~ /^(.+)(.{16})$/) {
      $url = sprintf ("%4x_%s", unpack ("%16C*", $1), $2);
    }
  }

  if (!-d $dir) {
    mkdir ($dir, 0777) or die "failed to mkdir '$dir'\n";
  }

  if (defined $subdir) {
    $dir .= $SLASH.$subdir;
    if (!-d $dir) {
      mkdir ($dir, 0777) or die "failed to mkdir '$dir'\n";
    }
  }

  return $dir.$SLASH.$url;
}

sub cachefilename {
  my $self = shift;
  mk_generic_cache_filename ($self->{cachedir}, $_[0], 0);
}
sub sharedcachefilename {
  my $self = shift;
  mk_generic_cache_filename ($self->{factory}->{sharedcachedir}, $_[0], 1);
}

sub _get_cached_page {
  my ($self, $url, $is_diff_page, $is_sharedcache_page) = @_;
  local ($_);

  # if -fullrefresh is on, do not return any cached pages.
  # if it and -fromcache are both on, *do* return them.
  if ($self->{main}->{cf}->{full_refresh} &&
                        !$self->{main}->{cf}->{use_only_cache})
  { return undef; }

  my $ptr = $self->get_ptr_from_cache_file ($url,
  					$is_sharedcache_page);
  if (!defined $ptr) { return undef; }

  my $singleton = $self->{singleton};
  my $cachedpage = $singleton->get_cached_page ($ptr);
  if (!defined ($cachedpage)) { return undef; }

  my $lastmod = $singleton->{lastmod};
  $self->update_oldest_page_at_site ($url, $lastmod);

  return new Sitescooper::CacheObject ($self->{main},
				       $self, $cachedpage, $lastmod);
}

# ---------------------------------------------------------------------------

sub get_cached_page {
  my ($self, $url) = @_;
  $self->_get_cached_page ($url, 0, 0);
}

sub get_cached_page_for_diff {
  my ($self, $url) = @_;
  $self->_get_cached_page ($url, 1, 0);
}

sub get_shared_cache_page {
  my ($self, $url) = @_;
  $self->_get_cached_page ($url, 0, 1);
}

# ---------------------------------------------------------------------------

sub _cache_page {
  my ($self, $url, $redir_from, $page, $cachelater) = @_;

  $url = Sitescooper::Util::URLWithoutAnchor ($url);

  # With the new cache singleton architecture, we only store a pointer
  # to the *real* page text in the caches. The real page text is stored
  # in a page cache directory, indexed by URL, checksum and size.
  # However, this only really becomes useful when a shared cache is in
  # use, so to speed things up in the non-shared-cache case, default
  # the sum and size to 0 unless shared cache use is enabled.

  my $sum = 0;
  my $size = 0;
  if (defined $self->{factory}->{sharedcachedir}) {
    $sum = sprintf ("%06lx", unpack("%32C*", $page));
    $size = length($page);
  }

  my $singleton = $self->{singleton};

  my $ptr = $self->get_ptr_from_cache_file ($url, 0);
  if (defined $ptr) { $singleton->dec_refcount_for_page ($ptr); }

  my $ptrfile = $singleton->save_cached_page ($url, $sum,
  			$size, $page, $self->{last_modtime}->{$url});

  # if this page is the latest version of a diffed page, don't cache it
  # immediately, as it will mean lost stories if we're interrupted.
  # Instead save the filename for renaming when the run finishes.

  $self->set_ptr_in_cache_file ($url, $ptrfile, 0);

  if (defined $self->{factory}->{sharedcachedir}) {
    my $ptr = $self->get_ptr_from_cache_file ($url, 1);
    if (defined $ptr) { $singleton->dec_refcount_for_page ($ptr); }

    # Increment the refcount, even though we just open/write/close'd the file
    # in the previous few lines. We just do it this way so that it's closed in
    # between, to minimise races with other processes using the same caches.
    my $ptrfile = $singleton->save_cached_page ($url, $sum,
			  	$size, $page, $self->{last_modtime}->{$url});

    $self->set_ptr_in_cache_file ($url, $ptrfile, 1);
  }

  if (defined $redir_from) {
    $self->_cache_page ($redir_from, undef, $page, $cachelater);
  }
}

sub cache_page_now { &_cache_page ($_[0], $_[1], $_[2], $_[3], 0); }

sub cache_page_at_commit { &_cache_page ($_[0], $_[1], $_[2], $_[3], 1); }

# ---------------------------------------------------------------------------

sub get_ptr_from_cache_file {
  my ($self, $url, $is_shared_cache_file) = @_;

  # for shared-cache files, we store the pointers in one-line files,
  # to avoid concurrency problems when multiple scoopers are running
  # using the same shared cache.
  # for private caches, store the ptrs in the mod_db.
  if ($is_shared_cache_file) {
    my $cachefile = $self->sharedcachefilename ($url);
    return undef if (!defined $cachefile);

    open (IN, "<$cachefile") or return undef;
    my $ptrfile = <IN>; chomp $ptrfile;
    close IN;
    return $ptrfile;

  } else {
    my $ptrfile = $self->{ptrs_to_commit}->{$url};
    return $ptrfile if (defined $ptrfile);

    $ptrfile = $self->{mod_db}->{'P'.$url};
    return $ptrfile if (defined $ptrfile);
  }

  undef;
}

sub set_ptr_in_cache_file {
  my ($self, $url, $pagefile, $is_shared_cache_file) = @_;

  if ($is_shared_cache_file) {
    my $cachefile = $self->sharedcachefilename ($url);
    return undef if (!defined $cachefile);

    open (OUT, ">$cachefile") or warn "cannot write to $cachefile\n";
    print OUT $pagefile,"\n";
    close OUT or warn "cannot write to $cachefile\n";

  } else {
    $self->{ptrs_to_commit}->{$url} = $pagefile;
  }
}

# ---------------------------------------------------------------------------

sub dbg {
  my $self = shift;
  $self->{main}->dbg(@_);
}

sub verbose {
  my $self = shift;
  $self->{main}->verbose(@_);
}

# ---------------------------------------------------------------------------

1;
