#!/usr/bin/perl

# >> dieses skript gehoert andrea stubbe, 		<<
# >> darf aber von netten menschen ausgeliehen 	<<
# >> und benutzt werden.						<<
# >> as79@gmx.de 								<<

# guesses charset of <file>
# usage: perl charset_guesser.pl <file> [efga]

# from	to	binary layout of the UTF-8 bytes
# 0000	007F	0xxx.xxxx
# 0080	07FF	110x.xxxx 10xxxxxx
# 0800	FFFF	1110.xxxx 10xxxxxx 10xxxxxx

no utf8;
use strict;
use constant DEBUG => 0;			# 0: off, 1: on, 2: a bit

# Evidence scores, one per candidate encoding.  The byte scan further down
# adds to them and the final guess compares them.
my $iso = 0;
my $iso15 = 0;
my $win = 0;
my $mac = 0;
my $utf8 = 0;
my $utf8_lf = 0;		# bumped once for every UTF-8 lead byte seen
my $noAscii = 0;		# number of bytes with the high bit set

# The first argument is the file to inspect.  Test for "defined and non-empty"
# rather than for truth, so that a file literally named "0" is accepted.
my $file = $ARGV[0];
die "\nPlease specify a file!\n(Usage: perl charset_guesser.pl <file> [efg])\n\n"
	unless defined $file && length $file;

my $fh;
if ( $file eq "-" ) {
	# "-" keeps its conventional meaning of standard input.
	$fh = \*STDIN;
} else {
	# Three-argument open: the name is taken literally and is never parsed
	# for mode characters or pipe symbols.
	open( $fh, '<', $file ) or die "could not open $file: $!";
}
binmode($fh);			# the scan needs the raw, untranslated bytes

# Slurp the whole file; $/ is only undefined inside the do block.
my $text = do { local $/; <$fh> };
close($fh);
$text = "" unless defined $text;

my @bytes    = split( //, $text );
my $b;				# loop variable of the byte scan
my $follower = 0;		# UTF-8 continuation bytes still expected
my $complete = "";		# bit string of the multi-byte sequence being collected

# Language hint: g = German, e = English, f = French, a = all (default).
my $lang = $ARGV[1] || "a";
#print "Language: $lang\n"	if DEBUG == 1;
	
# Bit-string patterns, matched against the "%08b" rendering of a single byte,
# for French letters in the single-byte encodings.  "Big" is upper case and
# "Sm" is lower case.  Per the original author's note each pair covers: three
# A's, AE, C cedilla, three E's, I circumflex, three O's, three U's, and the
# OE ligature where the encoding has one.  The patterns never change, so they
# are compiled once here instead of once per byte.
my $isoBig = qr/^(?:1100(0000|0001|0010)|11000110|11000111|1100(1000|1001|1010)|11001110|1101(0010|0011|0100)|1101(1001|1010|1011))$/;
my $isoSm = qr/^(?:1110(0000|0001|0010)|11100110|11100111|1110(1000|1001|1010)|11101110|1111(0010|0011|0100)|1111(1001|1010|1011))$/;
my $iso15Big = qr/^10111100$/;	# 0xBC: OE ligature in ISO-8859-15
my $iso15Sm = qr/^10111101$/;	# 0xBD: oe ligature in ISO-8859-15

my $macBig = qr/^(?:111001(01|11)|11001010|10011110|10000010|1110(0110|1001)|10000011|11101011|1110111(0|1)|11110001|1111(0010|0011|0100)|11001110)$/;
my $macSm = qr/^(?:1000(0111|1000|1001)|10111110|10001101|1000(1110|1111)|10010000|10010100|1001(0111|1000|1001)|1001(1100|1101|1110)|11001111)$/;

my $winBig = qr/^10001100$/;	# 0x8C: OE ligature in Windows-1252
my $winSm = qr/^10011100$/;	# 0x9C: oe ligature in Windows-1252

# Complete two-byte UTF-8 sequences (lead byte followed by continuation byte,
# as one 16-bit string) for the same French letters.
my $utf8French = qr/^(?:
	  1100001110100001 | 1100001110100000 | 1100001110100010	# a: acute, grave, circumflex
	| 1100001110000001 | 1100001110000000 | 1100001110000010	# A
	| 1100001110101001 | 1100001110101000 | 1100001110101010	# e
	| 1100001110001001 | 1100001110001000 | 1100001110001010	# E
	| 1100001110101101 | 1100001110101100 | 1100001110101110	# i
	| 1100001110001101 | 1100001110001100 | 1100001110001110	# I
	| 1100001110110011 | 1100001110110010 | 1100001110110100	# o
	| 1100001110010011 | 1100001110010010 | 1100001110010100	# O
	| 1100001110111010 | 1100001110111001 | 1100001110111011	# u
	| 1100001110011010 | 1100001110011001 | 1100001110011011	# U
	| 1100001110100111 | 1100001110000111				# c cedilla, C cedilla
	| 1100010110010010 | 1100010110010011				# OE and oe ligature: C5 92, C5 93
)$/x;

# Complete three-byte UTF-8 sequences (24-bit strings) for the euro sign, the
# trade mark sign and typographic quotes.
my $utf8Special = qr/^(?:
	  111000101000001010101100	# E2 82 AC  euro sign
	| 111000101000010010100010	# E2 84 A2  trade mark sign
	| 111010001000001010101101	# E8 82 AD  kept as found, intent unclear, TODO confirm
	| 111000101000000010011110	# E2 80 9E  low double quote
	| 111000101000000010111001	# E2 80 B9  single left angle quote
	| 111000101000000010011000	# E2 80 98  left single quote
	| 111000101000000010011001	# E2 80 99  right single quote
	| 111000101000000010011100	# E2 80 9C  left double quote
	| 111000101000000010011101	# E2 80 9D  right double quote
	| 111000101000000010111010	# E2 80 BA  single right angle quote
)$/x;

# Look at every byte and collect evidence for each candidate encoding.
foreach $b (@bytes) {
	#last if getDif() > 3;		# early exit

	my $byte = sprintf "%08b", ord($b);
#	print $byte."-"			if DEBUG == 1;

	# Bytes with the high bit clear are plain ASCII and carry no evidence.
	next unless $byte =~ /^1/;
	$noAscii++;

	# UTF-8 lead byte: 110xxxxx announces one continuation byte,
	# 1110xxxx announces two.  Start collecting a new sequence.
	if ($byte =~ /^11(0|10)/) {
		$complete = $byte;
		$follower = 1 if $1 eq "0";
		$follower = 2 if $1 eq "10";
		$utf8_lf += 0.5;	# original author's note: "what was this for??"
	}

	# UTF-8 continuation byte (10xxxxxx) that was announced by a lead byte.
	# The post-decrement also runs when $follower is already 0, so a stray
	# continuation byte drives it negative.  That keeps the "$follower == 0"
	# tests below from firing again on a stale $complete until the next
	# lead byte resets both variables.
	if ($byte =~ /^10/ && $follower-- > 0) {
		$complete .= $byte;
#		print "* ",length($complete),"\n";
		if (length($complete) == 24) {
			$utf8 += 3;
		} elsif (length($complete) == 16) {
			$utf8 += 1.5;
		}
	}

	# German: umlauts (both cases) and sharp s.
	if ($lang eq "g" || $lang eq "a") {
		if ($byte =~ /^((11100100)|(11110110)|(11111100)|(11000100)|(11010110)|(11011100)|(11011111))$/) {
			# ISO-8859-1 and Windows-1252 share these code points.
			$iso++;
			$win++;
		} elsif ($byte =~ /^((10001010)|(10011010)|(10011111)|(10000000)|(10000101)|(10000110)|(10100111))$/) {
			$mac++;
		} elsif ($follower == 0 && $complete =~ /^((1100001110100100)|(1100001110110110)|(1100001110111100)|(1100001110000100)|(1100001110010110)|(1100001110011100)|(1100001110011111))$/) {
			$utf8++;
		}
	}

	# French: accented vowels, c cedilla and the ae and oe ligatures.
	if ($lang eq "f" || $lang eq "a") {
		# Mac matches are weighted lower than the others.
		$mac += 0.4	if $byte =~ $macBig;
		$mac += 0.8	if $byte =~ $macSm;

		if ($byte =~ $isoBig) {
			$iso += 0.5;
			$win += 0.5;
		}
		if ($byte =~ $isoSm) {
			$iso += 1;
			$win += 1;
		}

		$iso15 += 0.5	if $byte =~ $iso15Big;
		$iso15 += 1	if $byte =~ $iso15Sm;

		# Windows-1252 keeps the OE ligature at 0x8C and 0x9C, so it is
		# scored with its own patterns, not with the ISO-8859-15 ones.
		$win += 0.5	if $byte =~ $winBig;
		$win += 1	if $byte =~ $winSm;

		if ($follower == 0 && $complete =~ $utf8French) {
			$utf8++;
		}
	}

	# Special characters: euro, trade mark, registered sign, quotes and dashes.
	# Only the first matching branch scores.
	if ($byte =~ /^(10101110|10111010|10111011)$/) {	# per original note: R (registered sign)
		$win++;
		$iso++;
		$utf8++;
	} elsif ($byte =~/^(10000000|10011001|10000010|10000100|10001011|10011011)$/) {	# euro, trade mark, quotes
		$win++;
	} elsif ($byte =~ /^(10010001|10010010|10010011|10010100|10010110|10010111|10000101)$/) {	# quotes, dashes, ellipsis
		$win++;
		$iso++;
	} elsif ($byte =~ /^(11001010|11001001|11010000|11010001)$/) {	# per original note: dash, no-break space, ellipsis
		$mac += 0.8;
	} elsif ($byte =~ /^10100100$/) {	# euro
		$iso15++;
	} elsif ($byte =~ /^(11011011|10101010|10101000|11100010|11100011|11011100|11011101|11010010|10010011|10010100|10010101|11000111|11001000)$/) {	# euro, trade mark, registered sign, quotes
		$mac += 0.8;
	} elsif ($follower == 0 && $complete =~ $utf8Special) {	# euro, trade mark, quotes
		$utf8++;
	}
}

# Turn the collected scores into a verdict and print it.
# ISO-8859-15 is a superset case of ISO-8859-1 here: it inherits every point
# scored for ISO-8859-1 on top of its own.
$iso15 += $iso;

if (DEBUG == 1) {
	print "\nISO: $iso, WIN: $win, MAC: $mac, UTF8: $utf8\n";
}
if (DEBUG == 2) {
	print "$file  \t:";
}

# The first condition that holds decides; ISO-15 is the fallback.
my $guess =
	  ($noAscii == 0)								? "ASCII"
	: ($iso >= $win && $iso >= $iso15 && $iso > $utf8 && $iso >= $mac)	? "ISO-1"
	: ($win > $iso15 && $win > $utf8 && $win > $mac)			? "WINDOWS"
	: ($utf8 > $iso15 && $utf8 >= $mac)					? "UTF8"
	: ($mac > $iso15)							? "MAC"
	:									  "ISO-15";

print $guess, "\n";

# Returns the margin between the highest and the second-highest of the five
# encoding scores ($utf8, $win, $mac, $iso, $iso15).  Takes no arguments and
# only reads the scores; none of them is modified.
# Both values start at 0, which is also the lowest value a score starts from.
sub getDif {
	my $max = 0;
	my $sec = 0;
	# A plain loop is used instead of sort: this file declares a lexical $b,
	# which would hide the $b that a sort comparison block relies on.
	foreach my $score ($utf8, $win, $mac, $iso, $iso15) {
		if ( $score > $max ) {
			# New leader: the previous leader becomes the runner-up.
			$sec = $max;
			$max = $score;
		} elsif ( $score > $sec ) {
			$sec = $score;
		}
	}
	return $max - $sec;
}
