#!/usr/bin/perl
#
# Remove duplicated lines from a file with many sort options.
# Coded by Trizen under the GPL on 14 December 2011
# Latest edit on 22 December 2011
#
# Website: http://trizen.googlecode.com
#
# Usage example: perl resdup -s input.txt -o output.txt

##################
# Appname: resdup
# Version: 0.0.4
##################

use strict;
use warnings;

# Print the help text on STDOUT and terminate the program.
#
# Takes one optional argument: the process exit status (defaults to 0).
sub usage {
    my ($status) = @_;

    print <<"USAGE";
usage: $0 [options] input.txt -o output.txt

Options:
    -s : case-sensitive sorted output (a-z)
    -S : same as above, but reversed (z-a)
    -i : case-insensitive sorted output (a-z)
    -I : same as above, but reversed (z-a)
    -n : numeric sorted output (0-9)
    -N : same as above, but reversed (9-0)
    -m : delete dups by insensitive match
    -M : same as above, but reversed
    -r : random output (may be faster)
    -w : case-sensitive sort, but ignore edge whitespaces
    -W : case-insensitive sort, but ignore edge whitespaces
    -f : no sorting (keeps first duplicated item) (default)
    -l : no sorting (keeps last duplicated item) (slow)

USAGE

    exit($status // 0);
}

my $in_file;     # input filename
my $out_file;    # output filename
my $in_fh;       # input file handle
my $out_fh;      # output file handle

# Command-line flag => name of the handler sub that implements it.
my %sort_method_for = (
    '-s' => 'sensitive_sort',
    '-S' => 'sensitive_sort',
    '-i' => 'insensitive_sort',
    '-I' => 'insensitive_sort',
    '-n' => 'numeric_sort',
    '-N' => 'numeric_sort',
    '-m' => 'matching_sort',
    '-M' => 'matching_sort',
    '-w' => 'ignore_white_spaces',
    '-W' => 'ignore_white_spaces',
    '-r' => 'random_sort',
    '-f' => 'keep_first_duplicated_item',
    '-l' => 'keep_last_duplicated_item',
);

# $type_of_{'sort'} holds [handler name, flag that selected it].
my %type_of_;
$type_of_{'sort'} = ['keep_first_duplicated_item', 0];    # default sort method

while (@ARGV) {
    my $arg = shift @ARGV;

    if ($arg eq '-h') {
        usage();
    }
    elsif (exists $sort_method_for{$arg}) {

        # When several sort flags are given, the last one wins.
        $type_of_{'sort'} = [$sort_method_for{$arg}, $arg];
    }
    elsif (not defined $in_file and -f $arg) {

        # The first argument naming an existing file is the input file.
        $in_file = $arg;
    }
    elsif ($arg eq '-o') {
        $out_file = shift(@ARGV) // _die(undef);
    }
    else {
        warn "$0: ignoring unrecognized argument or missing file '$arg'\n";
    }
}

# Without -o the result goes to standard output.
if (defined $in_file and not defined $out_file) {
    $out_fh = \*STDOUT;
}

if (not defined $in_file) {
    usage();
}

# Report a fatal problem.
#
# With a filename: die with the reason the last open() on it failed.
# With undef:      complain about missing input/output files on STDERR,
#                  show the usage text and exit with status 1.
sub _die {
    my $file = shift @_;

    if (defined $file) {
        die "Unable to open '${file}': $!";
    }

    print STDERR "error: input/output files must be specified\n\n";
    usage(1);
}

if (defined $out_file and $in_file eq $out_file) {

    # Opening the output for writing would truncate the input.
    die "Try a different name for output file!\n";
}
else {
    open $in_fh, '<', $in_file or _die($in_file);
    if (!defined $out_fh) {
        open $out_fh, '>', $out_file or _die($out_file);
    }
}

# Handler name => code reference. A dispatch table replaces building and
# eval-ing a string of Perl code from the parsed options.
my %handler_for = (
    sensitive_sort             => \&sensitive_sort,
    insensitive_sort           => \&insensitive_sort,
    numeric_sort               => \&numeric_sort,
    matching_sort              => \&matching_sort,
    ignore_white_spaces        => \&ignore_white_spaces,
    random_sort                => \&random_sort,
    keep_first_duplicated_item => \&keep_first_duplicated_item,
    keep_last_duplicated_item  => \&keep_last_duplicated_item,
);

{
    my ($sort_name, $sort_flag) = @{ $type_of_{'sort'} };
    my $handler = $handler_for{$sort_name}
      or die "$0 error: unknown sort method '$sort_name'\n";
    $handler->($sort_flag);
}

# Return 1 when the selected flag equals the upper-cased form of the flag
# passed in (i.e. the upper-case variant of an option was used), else 0.
sub check_sub_argument {
    return $type_of_{'sort'}->[1] eq uc shift() ? 1 : 0;
}

# Read every line of the input file.
#
# A final line that lacks a line terminator gets "\n" appended, so it is
# recognised as a duplicate of the same text elsewhere in the file and is
# not glued to the following line once the output is reordered.
sub read_lines {
    my @lines = <$in_fh>;
    if (@lines and $lines[-1] !~ /\n\z/) {
        $lines[-1] .= "\n";
    }
    return @lines;
}

# -s / -S : unique lines in case-sensitive string order (-S: descending).
sub sensitive_sort {
    my $reversed = check_sub_argument(@_);

    my %seen;
    @seen{ read_lines() } = ();

    if ($reversed) {
        print {$out_fh} sort { $b cmp $a } keys %seen;
    }
    else {
        print {$out_fh} sort keys %seen;
    }
    return 1;
}

# -i / -I : unique lines ordered by their lower-cased text (-I: descending).
# Lines differing only in case are all kept.
sub insensitive_sort {
    my $reversed = check_sub_argument(@_);

    my %seen;
    @seen{ read_lines() } = ();

    print {$out_fh}
      sort { $reversed ? (lc $b cmp lc $a) : (lc $a cmp lc $b) } keys %seen;
    return 1;
}

# -n / -N : unique lines ordered by numeric value (-N: descending).
sub numeric_sort {
    my $reversed = check_sub_argument(@_);

    my %seen;
    @seen{ read_lines() } = ();

    # Lines that are not numbers compare by whatever numeric prefix they
    # have (0 when none); keep that quiet under "use warnings".
    no warnings 'numeric';
    print {$out_fh}
      sort { $reversed ? ($b <=> $a) : ($a <=> $b) } keys %seen;
    return 1;
}

# -m / -M : sort by lower-cased text and print only the first line of each
# run of lines that are equal ignoring case. With -M the input order is
# reversed before sorting.
sub matching_sort {
    my $reversed = check_sub_argument(@_);

    my @lines = read_lines();
    @lines = reverse @lines if $reversed;

    my $last_line = '';
    foreach my $line (sort { lc $a cmp lc $b } @lines) {
        next if lc $last_line eq lc $line;
        print {$out_fh} $line;
        $last_line = $line;
    }
    return 1;
}

# -r : unique lines in whatever order the hash returns its keys.
sub random_sort {
    my %seen;
    @seen{ read_lines() } = ();
    print {$out_fh} keys %seen;
    return 1;
}

# -f : keep input order, printing only the first occurrence of each line.
sub keep_first_duplicated_item {
    my %seen;
    print {$out_fh} grep { !$seen{$_}++ } read_lines();
    return 1;
}

# -l : keep input order, printing only the last occurrence of each line.
sub keep_last_duplicated_item {

    # Walking the lines backwards turns "last occurrence" into "first
    # seen"; reversing the survivors restores the input order. This is a
    # single pass with a hash instead of rescanning the remaining lines
    # for every line.
    my %seen;
    print {$out_fh} reverse grep { !$seen{$_}++ } reverse read_lines();
    return 1;
}

# -w / -W : unique lines ordered by their text with leading and trailing
# whitespace (and NUL characters) ignored; -W additionally ignores case.
# Duplicates are detected on the exact line, not on the trimmed text.
sub ignore_white_spaces {
    my $insensitive_case = check_sub_argument(shift @_);

    my %seen;
    my @unique_lines = grep { !$seen{$_}++ } read_lines();

    # Compute each sort key once rather than on every comparison.
    my %key_for;
    foreach my $line (@unique_lines) {
        (my $key = $line) =~ s/\A[\s\0]+|[\s\0]+\z//g;
        $key_for{$line} = $insensitive_case ? lc $key : $key;
    }

    print {$out_fh} sort { $key_for{$a} cmp $key_for{$b} } @unique_lines;
    return 1;
}

close $in_fh;

# Buffered write errors only surface when the handle is closed.
close $out_fh or die "$0 error: unable to close output: $!\n";