Friday, February 25, 2011

Converting HTML character entities to the actual characters

Source : http://webglimpse.net/docs/howto_uml.html

#!/usr/local/bin/perl


# converts html files into ascii by just stripping anything between
# < and >
# written 4/21/96 by Michael Smith for WebGlimpse
#
# Added code to replace html codes for special chars with the
# characters themselves. 12/19/98 --GB
#
# Also add space in place of space-producing HTML tags
# 12/22/98 --GB

# True while the previous line ended inside a tag that has not been
# closed yet, so the start of the next line (up to its first '>') is
# still tag text and must be discarded.
my $carry = 0;

# Filter standard input one line at a time: strip the tags, decode the
# character entities, and print what is left.
LINE:
while (my $line = <STDIN>) {

    if ($carry) {
        # Drop everything up to and including the first '>'.  If the line
        # has no '>' at all, it lies entirely inside the open tag: skip it.
        next LINE unless $line =~ s/[^>]*>//;

        # The substitution succeeded, so the open tag is now closed.
        $carry = 0;
    }

    # Replace each complete tag with whatever addspace() returns for it
    # (either '' or a single space).  Repeat until nothing matches.
    # The subs are called with '&' so this loop works whether or not they
    # are declared with a prototype.
    1 while $line =~ s/(<[^\s>][^>]*>)/&addspace($1)/ge;

    # A tag that opens here but does not close on this line: cut it off
    # and remember that we are still inside it.
    if ($line =~ s/<[^\s>].*$//) {
        $carry = 1;
    }

    print &fixspecial($line);
}


# addspace($tag)
#
# Decide what text should replace an HTML tag that is being stripped.
#
#   $tag - the complete tag text, starting with its '<'.
#
# Returns '' for inline formatting tags (b, i, em, u, font, strong, big,
# sup, sub, strike, style -- opening or closing, any letter case), because
# removing them must not split a word.  Returns a single space for every
# other tag.
#
# The sub has no prototype (an empty "()" prototype would declare that it
# takes no arguments) and works on a lexical copy, so the caller's $_ is
# left untouched.
sub addspace {
    my ($tag) = @_;

    # Tags that should NOT produce a space.  The patterns are anchored at
    # the start so that only the tag's own name decides, not text that
    # happens to appear later inside its attributes.
    return '' if $tag =~ m{\A(?:<\/?b>|<\/?i>|<\/?em>|<\/?font|<\/?strong|<\/?big)}i;
    return '' if $tag =~ m{\A(?:<\/?sup|<\/?sub|<\/?u>|<\/?strike|<\/?style)}i;

    # Otherwise, put in a space.
    return ' ';
}


# Lookup table used by fixspecial().  It is filled on first use by
# _entity_table() rather than at file scope, because the subs may be
# called before execution reaches this point in the file.
my $ENTITY_CHAR_FOR;

# _entity_table()
#
# Build (once) and return a hash ref that maps an entity body -- the text
# between '&' and ';', e.g. 'eacute' or '#233' -- to its replacement.
sub _entity_table {
    return $ENTITY_CHAR_FOR if $ENTITY_CHAR_FOR;

    # Named entities for character codes 160 through 255, in code order.
    my @latin1_names = qw(
        nbsp   iexcl  cent   pound  curren yen    brvbar sect
        uml    copy   ordf   laquo  not    shy    reg    macr
        deg    plusmn sup2   sup3   acute  micro  para   middot
        cedil  sup1   ordm   raquo  frac14 frac12 frac34 iquest
        Agrave Aacute Acirc  Atilde Auml   Aring  AElig  Ccedil
        Egrave Eacute Ecirc  Euml   Igrave Iacute Icirc  Iuml
        ETH    Ntilde Ograve Oacute Ocirc  Otilde Ouml   times
        Oslash Ugrave Uacute Ucirc  Uuml   Yacute THORN  szlig
        agrave aacute acirc  atilde auml   aring  aelig  ccedil
        egrave eacute ecirc  euml   igrave iacute icirc  iuml
        eth    ntilde ograve oacute ocirc  otilde ouml   divide
        oslash ugrave uacute ucirc  uuml   yacute thorn  yuml
    );

    my %char_for;
    my $code = 160;
    for my $name (@latin1_names) {
        my $char = chr $code;
        $char_for{$name}    = $char;    # &name;
        $char_for{"#$code"} = $char;    # &#NNN;
        $code++;
    }

    # A non-breaking space becomes an ordinary space in the text output.
    $char_for{'nbsp'} = $char_for{'#160'} = ' ';

    $char_for{'quot'} = $char_for{'#34'} = '"';
    $char_for{'amp'}  = $char_for{'#38'} = '&';

    $ENTITY_CHAR_FOR = \%char_for;
    return $ENTITY_CHAR_FOR;
}

# fixspecial($text)
#
# Replace HTML character entities in $text with the characters they stand
# for and return the result.
#
#   $text - a string, typically one line with the tags already removed.
#
# Handled: the named entities for codes 160-255 (&nbsp; ... &yuml;) and
# their decimal forms (&#160; ... &#255;), plus &quot; / &#34; and
# &amp; / &#38;.  Entity names are case sensitive.  Anything else that
# looks like an entity is left exactly as it was.
#
# Decoded characters are single characters with codes below 256; printed
# to a handle without an encoding layer each one is written as one byte.
#
# All entities are decoded in a single left-to-right pass, so text
# produced by decoding "&amp;" is never decoded a second time
# ("&amp;eacute;" yields "&eacute;").
#
# The sub has no prototype and works on a lexical copy, so the caller's
# $_ is left untouched.  An undefined argument is returned unchanged.
sub fixspecial {
    my ($text) = @_;
    return $text unless defined $text;

    my $char_for = _entity_table();

    $text =~ s{&(\#[0-9]+|[A-Za-z][A-Za-z0-9]*);}
              { exists $char_for->{$1} ? $char_for->{$1} : "&$1;" }ge;

    return $text;
}