#! /usr/bin/perl -w

package extract_links;

use strict;

# Extract the set of unique hyperlink targets from an HTML page.
#
# Input:  $url       - address of the page. Accepted for interface
#                      compatibility but currently unused: links are returned
#                      as written and relative links are NOT resolved.
#         $page_text - the HTML text of the page as a single string.
# Output: a list of the unique, normalized href values of the <a> tags, in
#         order of first appearance. If you need to count the number of links
#         to a target you need to modify the code (see %seen below).
#
# Filtering and normalization applied to every href value:
#   * surrounding whitespace is trimmed; empty values are ignored;
#   * links ending in .ps, .pdf, .gz, .zip or .Z are dropped (case-sensitive);
#   * links containing "cgi", "doubleclick" or "download" are dropped;
#   * a trailing "index.htm"/"index.html" path component is removed, then one
#     trailing "/" is removed (so "/" and "index.html" normalize to "").
sub extract_links {
  my ($url, $page_text) = @_;
  return () unless defined $page_text;

  my $DEBUG = 0; # when set to 1 prints each accepted link and its match position
  my @link_set;
  my %seen;      # normalized link => number of times it was encountered

  # Matches an anchor tag up to and including its href value. The value may be
  # double-quoted, single-quoted or bare; other attributes may precede href.
  my $href_re = qr{
    <a \s+ (?: [^>]*? \s )?   # tag name, then any attributes before href
    href \s* = \s*
    (?: " ([^"]+) "           # double-quoted value
      | ' ([^']+) '           # single-quoted value
      | ([^\s>"']+)           # unquoted value
    )
  }xi;

  while ($page_text =~ /$href_re/g) {
    # Exactly one of the three capture groups is defined for a given match.
    my ($clean_link) = grep { defined } ($1, $2, $3);

    $clean_link =~ s/\A\s+//;
    $clean_link =~ s/\s+\z//;
    next if $clean_link eq '';

    # Skip documents/archives and dynamic, advertising or download targets.
    next if $clean_link =~ /\.(?:ps|pdf|gz|zip|Z)\z/;
    next if $clean_link =~ /cgi|doubleclick|download/;

    # Only strip index.htm(l) when it is a whole path component, so that
    # e.g. "reindex.html" is left alone.
    $clean_link =~ s{(?:\A|(?<=/))index\.html?\z}{}i;
    $clean_link =~ s{/\z}{};

    # Keep only the first occurrence of each normalized link.
    next if $seen{$clean_link}++;

    push @link_set, $clean_link;
    print "\t$clean_link (", pos($page_text), ")\n" if $DEBUG;
  }

  return @link_set;
}
