#!/usr/bin/perl # Coded by Trizen # Email: echo dHJpemVueEBnbWFpbC5jb20K | base64 -d # Website: http://trizen.go.ro use strict; sub usage { print "usage: $0 [options] [URLs] \nOptions: -m, --main : prints main URL -i, --intern : seek for intern URLs -h, --help : prints this message \nExample: $0 -i http://trizen.googlecode.com\n\n"; exit; } my $valid_url = qr"^ ################# This regex will validate an HTTP URL ################# https?:// # http or https followed by :// [[:alnum:]] # first character must be a-zA-Z0-9 (?:(?:(?:\w*-+\w+|\w+)* # words, dash, words OR only words \.(?=\w))+? # point if followed by word char | # OR (validates http://x.yz) \.) # a single dot \w{2,6} # domain (words between 2 and 6 chars) (?:[#/?!] # characters after domain [#-)+-;=?\\~\w]*)* # the rest of characters of the string $"x; my @main_urls; my $print_main_url = 0; my $get_intern_urls = 0; foreach my $arg (@ARGV) { if ($arg =~ /^-+m/) { $print_main_url = 1; } elsif ($arg =~ /^-+h/) { usage; } elsif ($arg =~ /^-+i/) { $get_intern_urls = 1; } elsif ($arg =~ /$valid_url/o) { push @main_urls, $arg; } } usage() unless @main_urls; require LWP::UserAgent; my $lwp = 'LWP::UserAgent'->new('timeout', 10, 'env_proxy', 1); $lwp->agent('Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.7'); sub get_links { my ($url) = @_; my %found_urls; if ($print_main_url) { print "\n\n=>> Mainurl: $url\n"; } foreach my $line (split(/\b(?=href\s*=|url\s*=)/i, $lwp->get($url)->content, 0)) { if ($line =~ /\bhttp:[^"'\n<>]+/p) { my $found_url = ${^MATCH}; $found_url = edit_url($found_url); if ($found_url =~ /$valid_url/o) { ++$found_urls{$found_url}; } } if ($get_intern_urls) { my ($main_url_domain) = $url =~ m[^https?://([^/]+)]; if ($line =~ /\b(?:href|url)\s*=\s*['"]?(?!http)([^'"\s<>]+)/i) { my $found_path = $1; my $found_url = $main_url_domain =~ /^\Q$found_path\E/ ? $found_path : "$main_url_domain/$found_path"; substr($found_url, 0, 0, $1) if $url =~ m{^(https?://)}; $found_url = edit_url($found_url); if ($found_url =~ /$valid_url/o) { ++$found_urls{$found_url}; } } } } print join("\n", sort(keys %found_urls)), "\n"; } sub edit_url { my ($self) = shift; $self =~ s{[/\\]+$}{}; $self =~ s{(?