#! /usr/bin/perl # # fetch - Utility for fetching internet objects by following links on web # Copyright (C) 2004 Tommi Saviranta # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # # # Version: fetch v0.1.3 24-Nov-2004 wnd@iki.fi # BUGS: # - Cookie-support has some bugs. This implementation works with my comics. # If you can't live with it, you fix it. I'm having a day off now. use strict; use warnings; use Socket; my $__debug = 0; # 1 progress, 2 connect, 4 dump (text) my ($__referer, $__redirect, $__socket, $__page_uri); my (@__cookie_domain, @__cookie_path, @__cookie_data); my $rulefile = "/home/user/fetch/rules"; &main(); sub main { my ($uri, $page); my ($title, $cmd, $data); my $debug = 0; open(FILE, "< $rulefile") or die "Can't read rules!"; while () { # Remove linefeed, comments and trailing whitespaces chop; s/\\#/\n/; # Temporarily convert escaped #s into lf $data = $_; # Save $_ ($_) = /(.*?)#/s; # Strip comments $_ = $data if (! defined($_)); # No comment? Restore backup. ($_) = /(.*?)\s*$/s; # Strip whitespaces s/\n/#/s; # Put #s back next if (! /./); # Skip if empty line # Get command and data-part ($cmd, $data) = /(.*?)\s+(.*)/; # Command + data? $cmd = $_ if (! defined($cmd)); if ($cmd eq "title") { undef($__referer); &clear_cookies(); $title = $data; print "Getting $title...\n"; } elsif ($cmd eq "url") { $uri = $data; } elsif ($cmd eq "fetch") { $page = &http_get($uri, 80); } elsif ($cmd eq "get") { # The heart of whole thing... my ($opt, $param); ($opt, $param) = $data =~ /(.*?)\s+(.*)/; $uri = &get($opt, $param, $page); } elsif ($cmd eq "save") { if (! defined($uri)) { print "Didn't find URI!\n"; } else { print "Save $uri as $data.\n" if ($debug & 1); $page = &http_get($uri, 80); &save_page($data, $page); } } elsif ($cmd eq "stop") { die "stopped"; } else { die "Unrecognized command: $cmd"; } } close(FILE); } # main exit(0); ### get # 0: options # 1: parameters # 2: page content # sub get { my $mopt = $_[0]; my $mparam = $_[1]; my $page = $_[2]; my ($uri, $t0, $t1); my ($otag, $etag, $opt); $mparam =~ s/ /.*?/; if ($mopt eq "image") { $otag = "; if ($phase == 0) { $header .= $_; ($size) = /: (\d+)/ if (/^Content-Length:\s/); ($type) = /: ([^\s]*)/ if (/^Content-Type:\s/); } else { $data .= $_; $done += length($_); } $phase++ if ($phase == 0 && /^\r?\n/); if (($__debug & 4) && ($type =~ /text/ || $phase == 0)) { print STDERR "<< $_"; } elsif (($__debug & 1) && $size > 0) { printf STDERR "\rGetting file: %d/%d...", $done, $size; print STDERR " done.\n" if ($done == $size); } } return ($header, $data); } # __read_socket ### __write_socket # 0: data # sub __write_socket { print $__socket $_[0]; print STDERR ">> $_[0]" if ($__debug & 4); } # __write_socket ### http_get # 0: uri # 1: port # 2: no-track # sub http_get { my ($base) = $_[0] =~ /[^:]*:\/\/([^\/]*)/; my $port = defined($_[1]) ? $_[1] : 80; my ($header, $data); my $newuri; my $uri = $_[0]; do { ($header, $data) = &__http_get_real($uri, $port, $_[2]); $newuri = &__get_redirect($uri, $port, $header); if (defined($newuri)) { $uri = $newuri; print STDERR "Redirecting: $uri...\n" if ($__debug & 4); } else { undef($newuri); } } while (defined($newuri)); $__page_uri = $uri; # Needed in complete_uri() return $data; } # http_get ### __http_get_real # 0: uri # 1: port # 2: no-track # sub __http_get_real { my ($base, $uri) = $_[0] =~ /[^:]*:\/\/([^\/]*)(.*)/; my ($header, $data); &__connect($base, $_[1]); &__write_socket("GET $uri HTTP/1.0\r\n"); &__write_socket("Host: $base\r\n") if (! defined($_[2])); &__write_socket("Referer: http://$__referer\r\n") if (defined($__referer)); &__send_cookies($base, $uri); &__write_socket("\r\n"); $__referer = $base . $uri if (! defined($_[2])); ($header, $data) = &__read_socket(); &__close_socket(); &__read_cookies($header); return ($header, $data); } # __http_get_real ### http_post # 0: uri # 1: port # 2: data # 3: no-track # sub http_post { my ($base) = $_[0] =~ /[^:]*:\/\/([^\/]*)/; my $port = defined($_[1]) ? $_[1] : 80; my ($header, $data); my $uri = $_[0]; my $newuri; $__page_uri = $uri; # Needed in complete_uri() ($header, $data) = &__http_post_real($uri, $port, $_[2], $_[3]); $newuri = &__get_redirect($uri, $port, $header); ($header, $data) = &http_get($newuri, 80) if (defined($newuri)); return $data; } # http_post ### __http_post_real # 0: uri # 1: port # 2: data # 3: no-track # sub __http_post_real { my ($base, $uri) = $_[0] =~ /.*:\/\/([^\/]*)(.*)/; my ($header, $data); &__connect($base, $_[1]); &__write_socket("POST $uri HTTP/1.0\r\n"); &__write_socket("Host: $base\r\n") if (! defined($_[3])); &__write_socket("Referer: http://$__referer\r\n") if (defined($__referer)); &__send_cookies($base, $uri); &__write_socket("Content-Type: application/x-www-form-urlencoded\r\n"); &__write_socket("Content-Length: " . length($_[2]) . "\r\n"); &__write_socket("\n"); &__write_socket($_[2]); $__referer = $base . $uri if (! defined($_[3])); ($header, $data) = &__read_socket(); print STDERR "\n" if ($__debug & 4); # final line-feed, &__close_socket(); &__read_cookies($header); return ($header, $data); } # __http_post_real ### __get_redirect # 0: uri # 1: port # 2: data sub __get_redirect { my ($base) = $_[0] =~ /.*?:\/\/(.*?)\//; my $uri; if ($_[2] =~ /^HTTP\/\d\.\d 30[12]/) { $__redirect++; die "Too many redirects !" if ($__redirect == 17); ($uri) = $_[2] =~ /\nLocation: (.*?)[\r\n]/; $uri = "$base$uri" if ($uri =~ /^\//); $uri = "http://$uri" if (! ($uri =~ /^http/)); } else { $__redirect = 0; } return $uri; } # __get_redirect ### clear_cookies # no parameters # sub clear_cookies { @__cookie_data = (); @__cookie_domain = (); @__cookie_path = (); } # clear_cookies ### __read_cookies # 0: page # sub __read_cookies { my ($data, $domain, $path); my $page = $_[0]; my ($foo, $bar, $back); while ($page =~ /Set-Cookie: /i) { ($_, $page) = $page =~ /.*?\nSet-Cookie: (.*?)[\r?\n](.*)/s; ($data, $_) = /(.*?)(;.*)/; $_ = " $_ "; $back = $_; ($foo, $domain, $bar) = /(.*;)domain=([^;]*)(.*)$/i; $_ = defined($domain) ? " $foo $bar " : $back; $domain = "" if (! defined($domain)); ($foo, $path, $bar) = /(.*;)path=([^;]*)(.*)$/i; $path = "" if (! defined($path)); if (defined($data) && $data =~ /=deleted$/i) { my $key; ($key) = $data =~ /(.*?)=/; for (my $i = 0; $i <= $#__cookie_data; $i++) { if ($__cookie_data[$i] =~ /^$key/ && $__cookie_domain[$i] eq $domain && $__cookie_path[$i] eq $path) { splice(@__cookie_domain, $i, 1); splice(@__cookie_path, $i, 1); splice(@__cookie_data, $i, 1); } } } else { push(@__cookie_domain, $domain); push(@__cookie_path, $path); push(@__cookie_data, $data); } } } # __read_cookies ### __send_cookies # 0: domain # 1: path # sub __send_cookies { my ($domain, $path); my $cookie = ""; for (my $i = 0; $i <= $#__cookie_domain; $i++) { $domain = $__cookie_domain[$i]; $path = $__cookie_path[$i]; if (($_[0] =~ /$domain$/i) && ($_[1] =~ /^$path/)) { $cookie .= $__cookie_data[$i] . "; "; } } if ($cookie =~ /./) { ($cookie) = $cookie =~ /(.*); /; &__write_socket("Cookie: $cookie\r\n"); } } # __send_cookies ### save_page # 0: filename # 1: page content # sub save_page { open(my $fd, "> $_[0]") or return -1; print $fd $_[1]; close($fd); print "" . length($_[1]) . " byte(s) written.\n"; return 0; } # save_page ### complete_uri # 0: uri # sub complete_uri { my $uri = $_[0]; my ($npath); my ($domain, $path) = $__referer =~ /([^\/]*)(.*)/; ($path) = $path =~ /(.*)\// if ($path =~ /\//); $_ = $_[0]; if (/^\//) { return "http://$domain$_"; } if (/^http:\/\//) { # http://... if (/http:\/\/[^\.]*\//) { # URL is like http://foo/bar.index ($_) = /\/\/(.*?)\//; } else { # URL is like http://www.foobar.com/... return $_[0]; } } if (! /^\//) { # ../... while (/^\.\.\//) { ($path) = $path =~ /(.*)\// if (defined($path)); ($_) = /...(.*)/; } $path = "" if (! defined($path)); } $_ = "/$_" if (! /^\//); return "http://$domain$path$_"; } # complete_uri