[Bug 44880] would like to be able to strip tags from html messages
Christian Schmitz
list at schweb.com.ar
Tue Aug 9 15:45:16 BST 2011
https://bugs.kde.org/show_bug.cgi?id=44880
--- Comment #18 from Christian Schmitz <list schweb com ar> 2011-08-09 14:45:15 ---
I wrote this script; it still needs to be refined for the case where an "<img"
tag is split across 2 different lines.
This script removes any type of HTML tag, keeping links to images as text;
links to URLs are kept as text as well.
I hope it is useful to the community.
#!/usr/bin/perl
# Script that converts HTML e-mails to plain text.
# Ideal for use with KMail.
#
# The parser works line by line and remembers in $block what, if anything,
# is carried over from the previous line:
#
#   $block=0  nothing is carried over from the previous line
#   $block=1  a URL (e.g. <img src=http:xxxx >) belonging to a tag that was
#             opened on an earlier line is still being looked for
#   $block=2  the end of an HTML tag that was opened on an earlier line is
#             still being looked for
#
# (These notes used to be wrapped by the mail client in the middle of a
# comment, which left bare words in the code and stopped the script from
# compiling; every line of the note is now a real comment.)
#
$filtrado =0; # Non-zero once at least one line has been filtered
$activo =1; # "The mail must be parsed" flag
$special =0; #
$block =0; # Multi-line parser state, see the table above
$block_fin =""; # Terminator that ends the tag currently being skipped
$str_fin =">"; # Tag terminator, initialised to the common case
# Tags that are deleted outright.  Each entry pairs the text that opens the
# span to delete with the text that closes it; @htmltags and @htmltags2 are
# the two parallel arrays the rest of the script indexes by position, and
# they are generated from this single list so they can never get out of step.
#
# Order matters: entries are tried from top to bottom and a short entry such
# as "<b" or "<p" also matches longer tag names ("<body", "<pre"), so the
# longer names are listed first.
#
# An HTML comment ends with "-->", not with the first ">" (a comment may
# itself contain markup), so "<!--" is paired with "-->".
my @tag_pairs = (
    [ "<html",   ">" ],
    [ "</html",  ">" ],
    [ "<body",   ">" ],
    [ "</body",  ">" ],
    [ "<table",  ">" ],
    [ "</table", ">" ],
    [ "<tr",     ">" ],
    [ "</tr",    ">" ],
    [ "<td",     ">" ],
    [ "</td",    ">" ],
    [ "<hr",     ">" ],
    [ "<pre",    ">" ],
    [ "</hr",    ">" ],
    [ "<b",      ">" ],
    [ "</b",     ">" ],
    [ "<p",      ">" ],
    [ "</p",     ">" ],
    [ "<!--",    "-->" ],
    [ "</a",     ">" ],
    [ "<span",   ">" ],
    [ "</span",  ">" ],
    [ "<font",   ">" ],
    [ "</font",  ">" ],
    [ "<style",  "</style>" ],    # the whole style sheet is dropped
    [ "<script", "</script>" ],   # the whole script is dropped
);
@htmltags  = map { $_->[0] } @tag_pairs;   # opening text
@htmltags2 = map { $_->[1] } @tag_pairs;   # matching closing text

# Tags whose URL attribute is meant to be kept as text: the tag opener, the
# attribute that introduces the URL, and the tag terminator.
@htmlspecial1=(
"<a",
"<img",
);
@htmlspecial2=(
" href=",
" src=",
);
@htmlspecial3=(
">",
">",
);
# Main loop: read the mail from standard input one line at a time, strip the
# HTML from it and write the resulting text to standard output.
#
# Standard input is read through the STDIN handle directly instead of
# re-opening the /dev/stdin path with an unchecked two-argument open, which
# fails silently on systems without that device.
#
# Diagnostic traces are written to STDERR, and only when the environment
# variable KMAIL_HTML_STRIP_DEBUG is set, so that standard output carries
# nothing but the converted mail.
while (defined(my $entrada = <STDIN>)) {
    my $traza = $ENV{KMAIL_HTML_STRIP_DEBUG};

    # <img> is the only tag whose URL is extracted here.
    my ($img_tag, $img_url, $img_fin) = ("<img", "src=", ">");

    my $reng = $entrada;
    chomp $reng;
    print STDERR "\nIN :$reng\n" if $traza;

    # Every line break on the line becomes a newline, not just the first
    # one; the self-closing spellings <br/> and <br /> are accepted as well.
    $reng =~ s{<br\s*/?>}{\n}g;

    # Finish whatever was left open by the previous line.  This is an
    # if/elsif on purpose: when strip_url() itself leaves the parser in
    # state 2, that state applies to the NEXT line.  Feeding the URL it just
    # returned to strip_fin() would throw the URL away.
    if ($block == 1) {
        # URL attribute marker first, tag terminator second.
        $reng = strip_url($reng, $img_url, $img_fin);
    }
    elsif ($block == 2) {
        # The terminator being waited for lives in $block_fin.
        my $cierre = (defined($block_fin) && length($block_fin)) ? $block_fin : ">";
        $reng = strip_fin($reng, $cierre);
    }
    $reng = "" unless defined $reng;

    # Not an else: when the open tag was closed on this line the state is
    # back to 0 and the rest of the line still has to be cleaned.
    if ($block == 0) {
        for my $i (0 .. $#htmltags) {
            my $tag    = $htmltags[$i];
            my $cierre = $htmltags2[$i];
            # \Q..\E: the tag is literal text, not a pattern.
            while ($reng =~ /\Q$tag\E/) {
                $reng = strip($tag, $reng, $cierre);
            }
        }
        while ($reng =~ /<img/) {
            $reng = strip_special($reng, $img_tag, $img_url, $img_fin);
        }
    }

    print STDERR "b=$block : $reng\n" if $traza;
    print "$reng\n";
}

if ($filtrado != 0) {
    print "filtrado por kmail-html-strip\n";
}
#######################################################################################
#
# strip($tag, $reng, $str_fin)
#
# Deletes the first occurrence of $tag in the line $reng, from the start of
# the tag up to and including the terminator $str_fin, and returns the
# shortened line.
#
#   $tag      literal text that opens the span to delete (e.g. "<td")
#   $reng     the line being processed
#   $str_fin  literal text that closes the span (e.g. ">" or "</style>");
#             ">" is used when it is missing or empty
#
# Side effects on the script's globals:
#   $filtrado   incremented whenever something is deleted
#   $block      set to 2 and
#   $block_fin  set to $str_fin when the terminator is not on this line, i.e.
#               the tag continues on the next one; everything from the tag
#               start to the end of the line is deleted in that case
#
# When $tag does not occur in $reng the line is returned unchanged and no
# global is touched.
#
sub strip {
    my ($tag, $reng, $str_fin) = @_;
    $reng    = ""  unless defined $reng;
    $str_fin = ">" unless defined($str_fin) && length($str_fin);
    return $reng unless defined($tag) && length($tag);

    my $inicio = index($reng, $tag);
    return $reng if $inicio < 0;

    $filtrado++;
    my $antes = substr($reng, 0, $inicio);

    # The terminator is looked for right after the tag text.  Skipping
    # length($str_fin) characters instead (as this routine used to do) jumps
    # over a terminator that follows the tag closely: in "<style></style>"
    # the "</style>" starts 7 characters in, not 8.
    my $fin = index($reng, $str_fin, $inicio + length($tag));
    if ($fin < 0) {
        # Tag still open at end of line: remember what closes it.
        $block_fin = $str_fin;
        $block     = 2;
        return $antes;
    }
    return $antes . substr($reng, $fin + length($str_fin));
}
################
#
# strip_fin($reng, $str_fin)
# strip_fin($reng, $str_url, $str_fin)
#
# Handles a line that starts inside a tag opened on an earlier line: deletes
# everything up to and including the terminator and returns what follows it.
#
#   $reng     the line being processed
#   $str_fin  literal text that closes the open tag; ">" is used when it is
#             missing or empty
#
# The terminator is the LAST argument.  The three-argument form mirrors the
# argument order of strip_url() (line, URL marker, terminator); its middle
# argument is ignored here.  Reading the terminator from the second argument
# in that form made this routine search for the wrong text.
#
# Side effects: $block is set to 0 when the terminator was found, and to 2
# when it was not (the tag is still open; the whole line is deleted and an
# empty string, never undef, is returned).
#
sub strip_fin {
    my $reng    = $_[0];
    my $str_fin = @_ > 2 ? $_[2] : $_[1];
    $reng    = ""  unless defined $reng;
    $str_fin = ">" unless defined($str_fin) && length($str_fin);

    my $fin = index($reng, $str_fin);
    if ($fin < 0) {
        $block = 2;
        return "";
    }
    $block = 0;
    return substr($reng, $fin + length($str_fin));
}
# strip_url($reng, $str_url, $str_fin)
#
# Handles a line that starts inside a tag which was opened on an earlier line
# and whose URL attribute has not been seen yet.  Returns the text to keep
# from this line: the URL (without its quotes) followed by whatever comes
# after the end of the tag.
#
#   $reng     the line being processed
#   $str_url  literal text that introduces the URL (e.g. "src=")
#   $str_fin  literal text that closes the tag; ">" when missing or empty
#
# A two-argument call whose second argument does not end in "=" is taken to
# be strip_url($reng, $str_fin): the URL marker then defaults to "src=".
#
# Outcomes, and the value left in the global $block:
#   * URL found and tag closed on this line      -> URL . rest,  $block=0
#   * URL found, tag still open at end of line   -> URL,         $block=2
#   * tag closed before any URL attribute        -> rest,        $block=0
#   * neither URL nor terminator on this line    -> "",          $block=1
# $block_fin is set to the terminator whenever the tag stays open.
#
# Set KMAIL_HTML_STRIP_DEBUG in the environment to get a trace on STDERR.
#
sub strip_url {
    my ($reng, $str_url, $str_fin) = @_;
    $reng = "" unless defined $reng;

    unless (defined($str_fin) && length($str_fin)) {
        if (defined($str_url) && length($str_url) && $str_url !~ /=\z/) {
            # Two-argument form: what was passed is the terminator.
            ($str_url, $str_fin) = ("src=", $str_url);
        }
        else {
            $str_fin = ">";
        }
    }
    $str_url = "src=" unless defined($str_url) && length($str_url);

    my $r_len   = length($reng);
    my $tag_fin = index($reng, $str_fin);
    my $url_ini = index($reng, $str_url);

    # No URL attribute inside the tag that is open.  A marker that only
    # appears after the terminator belongs to some later tag.
    if ($url_ini < 0 || ($tag_fin >= 0 && $tag_fin < $url_ini)) {
        if ($tag_fin >= 0) {
            # The tag ends here without a URL: just drop the rest of it.
            $block = 0;
            return substr($reng, $tag_fin + length($str_fin));
        }
        # Only more attributes on this line; keep looking on the next one.
        $block_url = $str_fin;
        $block_fin = $str_fin;
        $block     = 1;
        return "";
    }

    $block = 0;

    # The value starts right after the marker.  A quoted value runs to the
    # matching quote (or to the end of the line when the quote is missing);
    # an unquoted one runs to the next blank or ">".
    my $resto = substr($reng, $url_ini + length($str_url));
    my ($usados, $url) = (0, "");
    if ($resto =~ /\A(\s*"([^"]*)"?)/) {
        ($usados, $url) = (length($1), $2);
    }
    elsif ($resto =~ /\A(\s*'([^']*)'?)/) {
        ($usados, $url) = (length($1), $2);
    }
    elsif ($resto =~ /\A(\s*([^\s>]*))/) {
        ($usados, $url) = (length($1), $2);
    }

    my $url_fin = $url_ini + length($str_url) + $usados;
    $tag_fin = index($reng, $str_fin, $url_fin);

    print STDERR "tag_ini=--- url_ini=$url_ini url_fin=$url_fin tag_fin=$tag_fin r_len=$r_len\n"
        if $ENV{KMAIL_HTML_STRIP_DEBUG};

    if ($tag_fin < 0) {
        # URL seen, but the tag goes on past the end of the line.
        $block_fin = $str_fin;
        $block     = 2;
        return $url;
    }
    # The terminator itself is part of the tag and is not kept.
    return $url . substr($reng, $tag_fin + length($str_fin));
}
#########################################
#
# strip_special($reng, $tag, $str_url, $str_fin)
#
# Replaces the first occurrence of a URL-carrying tag (e.g. <img ...>) in the
# line $reng by the URL it refers to, and returns the resulting line.
#
#   $reng     the line being processed
#   $tag      literal text that opens the tag (e.g. "<img")
#   $str_url  literal text that introduces the URL (e.g. "src=")
#   $str_fin  literal text that closes the tag; ">" when missing or empty
#
# Outcomes ("before" is the text preceding the tag, "rest" the text following
# its terminator):
#   * URL found and tag closed on this line     -> before . URL . rest
#   * URL found, tag still open at end of line  -> before . URL,  $block=2
#   * tag closed without a URL attribute        -> before . rest
#   * tag still open and no URL seen yet        -> before,        $block=1
# $block_fin is set to the terminator whenever the tag stays open.  When $tag
# does not occur in $reng the line is returned unchanged.
#
# The URL attribute is only looked for inside the tag itself: a marker that
# appears after the tag's terminator belongs to a later tag, and the text in
# between must not be deleted.  The URL is returned without its quotes.
#
# Set KMAIL_HTML_STRIP_DEBUG in the environment to get a trace on STDERR.
#
sub strip_special {
    my ($reng, $tag, $str_url, $str_fin) = @_;
    $reng    = ""  unless defined $reng;
    $str_fin = ">" unless defined($str_fin) && length($str_fin);
    return $reng unless defined($tag) && length($tag);

    my $traza = $ENV{KMAIL_HTML_STRIP_DEBUG};
    print STDERR "special strip\n" if $traza;

    my $tag_ini = index($reng, $tag);
    return $reng if $tag_ini < 0;

    my $r_len   = length($reng);
    my $antes   = substr($reng, 0, $tag_ini);
    my $desde   = $tag_ini + length($tag);
    my $tag_fin = index($reng, $str_fin, $desde);
    my $url_ini = (defined($str_url) && length($str_url))
                ? index($reng, $str_url, $desde)
                : -1;

    if ($url_ini < 0 || ($tag_fin >= 0 && $tag_fin < $url_ini)) {
        if ($tag_fin >= 0) {
            # Complete tag without a URL: remove the tag, keep the rest.
            return $antes . substr($reng, $tag_fin + length($str_fin));
        }
        # Only the start of the tag is on this line, no URL yet.
        $block_url = $str_fin;
        $block_fin = $str_fin;
        $block     = 1;
        return $antes;
    }

    # The value starts right after the marker.  A quoted value runs to the
    # matching quote (or to the end of the line when the quote is missing);
    # an unquoted one runs to the next blank or ">".
    my $resto = substr($reng, $url_ini + length($str_url));
    my ($usados, $url) = (0, "");
    if ($resto =~ /\A(\s*"([^"]*)"?)/) {
        ($usados, $url) = (length($1), $2);
    }
    elsif ($resto =~ /\A(\s*'([^']*)'?)/) {
        ($usados, $url) = (length($1), $2);
    }
    elsif ($resto =~ /\A(\s*([^\s>]*))/) {
        ($usados, $url) = (length($1), $2);
    }

    my $url_fin = $url_ini + length($str_url) + $usados;
    $tag_fin = index($reng, $str_fin, $url_fin);

    print STDERR "tag_ini=$tag_ini url_ini=$url_ini url_fin=$url_fin tag_fin=$tag_fin r_len=$r_len\n"
        if $traza;

    if ($tag_fin < 0) {
        # URL seen, but the tag goes on past the end of the line.
        $block_fin = $str_fin;
        $block     = 2;
        return $antes . $url;
    }
    return $antes . $url . substr($reng, $tag_fin + length($str_fin));
}
--
Configure bugmail: https://bugs.kde.org/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are the assignee for the bug.
More information about the Kdepim-bugs
mailing list