#!/usr/local/bin/perl
#############################################################
# File: inlinksAnalysis.pl                                  #
# Description: Analyzes backlinks (inlinks) read from a TSV #
# file exported by the Yahoo! service.                      #
#############################################################
use LWP::Simple;
use LWP::UserAgent;
use HTML::TokeParser;
use strict;
use warnings;

# --- Input: read the list of page URLs from the TSV file -------------------
# Command line: <tsv-file> <base-url>
#   $fileToProcess  path of the tab-separated input file; the URL is taken
#                   from the second column of every line.
#   $baseurl        text that a link must contain to be counted later on.
# Results are left in @URLs (one entry per usable line) and $cnt (its size).
my @URLs = ();

# Both arguments are required; without them the rest of the script has
# neither pages to fetch nor anything to match links against.
die "Usage: $0 <tsv-file> <base-url>\n" if @ARGV < 2;

# Fetch the input parameters: file name and base URL
my $fileToProcess = $ARGV[0];
my $baseurl = $ARGV[1];
print "\nPrzetwarzanie: $fileToProcess";
my $cnt = 0;
# Open the file
if (-e $fileToProcess) {
    # Three-argument open with a lexical handle: the file name can never be
    # interpreted as an open mode, and the handle is closed explicitly below.
    open(my $fh, '<', $fileToProcess) or die $!;
    while (my $line = <$fh>) {
        # Drop the line terminator (LF or CRLF) so it cannot end up inside
        # the URL when the URL is the last column.
        $line =~ s/\r?\n\z//;
        my @fragments = split(/\t/, $line);
        my $url = $fragments[1];
        # Lines without a second column (blank lines, short rows) carry no
        # URL; skip them instead of storing undef.
        next unless defined $url && $url =~ /\S/;
        push @URLs, $url;
    }
    close($fh);
    $cnt = scalar @URLs;
} else {
    # A missing file is reported but not fatal: the script continues with an
    # empty URL list.
    print "\nPlik ($fileToProcess) nie istnieje";
}
# --- Crawl: count links that point at the base URL -------------------------
# Every page listed in @URLs is downloaded and scanned for <a> tags. For each
# href that contains $baseurl (case-insensitive, literal text):
#   %linkPopHash    href => number of times it was seen across all pages
#   %anchorPopHash  href => anchor text of the most recently seen occurrence
my $ua = LWP::UserAgent->new;
my $res;
$ua->agent("My Crawler");
my %linkPopHash = ();
my %anchorPopHash = ();

# Compare as lower-cased plain text rather than interpolating $baseurl into a
# regex: characters such as "." or "?" in a URL would act as metacharacters,
# and an empty pattern would reuse the last successfully matched regex.
my $baseNeedle = defined $baseurl ? lc($baseurl) : '';

# Walk the list itself; counting up to $cnt inclusive requested one
# nonexistent entry past the end of @URLs.
foreach my $pageUrl (@URLs) {
    next unless defined $pageUrl && length $pageUrl;

    $res = $ua->get($pageUrl);
    unless ($res->is_success) {
        # Only a body that belongs to this response may be parsed. Parsing
        # whatever temp file happened to exist on disk could count the
        # previous page's links a second time.
        warn "Skipping $pageUrl: " . $res->status_line . "\n";
        next;
    }

    # Undo any Content-Encoding but keep the bytes as sent (no charset
    # decoding), so the report receives the same data a saved file would.
    my $html = $res->decoded_content(charset => 'none');
    $html = $res->content unless defined $html;

    # A reference to a scalar makes the parser read the document from memory.
    my $p = HTML::TokeParser->new(\$html);
    next unless $p;

    while (my $token = $p->get_tag("a")) {
        # Get the link target and the anchor text
        my $url = $token->[1]{href} || "-";
        my $anchorText = $p->get_trimmed_text("/a");
        $url =~ s/^\s+//;
        $url =~ s/\s+$//;

        next if index(lc($url), $baseNeedle) < 0;

        #print "\nBase URL: $pageUrl LINK: $url";
        $linkPopHash{$url}++;
        $anchorPopHash{$url} = $anchorText;
    }
}
# --- Report: write the link popularity table -------------------------------
# One line per collected link, most frequently seen first:
#   <link>, <count>, "<anchor text>"
# Links with equal counts are ordered by link text so that repeated runs on
# the same data produce the same file.
my $reportFile = 'report.txt';
open(my $report, '>', $reportFile)
    or die "Cannot write $reportFile: $!\n";
foreach my $key (
    sort { $linkPopHash{$b} <=> $linkPopHash{$a} or $a cmp $b }
    keys %linkPopHash
) {
    print {$report} "$key, $linkPopHash{$key}, \"$anchorPopHash{$key}\"\n";
}
# Buffered write errors (e.g. a full disk) only surface when closing.
close($report) or die "Cannot finish writing $reportFile: $!\n";