[Bug 44880] would like to be able to strip tags from html messages

Christian Schmitz list at schweb.com.ar
Tue Aug 9 15:45:16 BST 2011


https://bugs.kde.org/show_bug.cgi?id=44880





--- Comment #18 from Christian Schmitz <list schweb com ar>  2011-08-09 14:45:15 ---
I wrote this script; it still needs to be refined for the case where an "<img"
tag is split across two different lines.

This script removes every kind of HTML tag, keeping links to images as text;
links to URLs are kept as text as well.

I hope it is useful to the community.

#!/usr/bin/perl
# Converts HTML e-mails to plain text.
# Intended to be used from KMail.
#
# States of $block (what is carried over from the previous input line):
#   $block=0  nothing is carried over from the previous line
#   $block=1  a URL is being looked for (e.g. <img src=http:xxxx >) for a tag
#             that was started on an earlier line
#   $block=2  the END of an HTML tag that was started on an earlier line is
#             being looked for
#
$filtrado    =0;    # non-zero once something has been filtered
$activo      =1;    # the mail must be parsed (not read by the code visible here)
$special     =0;    # (not read by the code visible here)
$block       =0;    # multi-line state, see the table above
$block_fin   ="";   # end marker being looked for while $block is non-zero
$block_url   ="";   # URL attribute marker being looked for while $block==1
$str_fin     =">";  # end-of-tag marker; the main loop overwrites it per tag
# Tag table: each entry pairs the text that opens a tag with the text that
# closes it.  Everything from the opening text up to and including the closing
# text is removed.  The order matters: the main loop processes the tags in
# exactly this sequence.  Keeping both halves in one list guarantees that
# @htmltags and @htmltags2 can never get out of step.
my @tag_pairs = (
    [ "<html",   ">" ],
    [ "</html",  ">" ],
    [ "<body",   ">" ],
    [ "</body",  ">" ],
    [ "<table",  ">" ],
    [ "</table", ">" ],
    [ "<tr",     ">" ],
    [ "</tr",    ">" ],
    [ "<td",     ">" ],
    [ "</td",    ">" ],
    [ "<hr",     ">" ],
    [ "<pre",    ">" ],
    [ "</hr",    ">" ],
    [ "<b",      ">" ],
    [ "</b",     ">" ],
    [ "<p",      ">" ],
    [ "</p",     ">" ],
    # An HTML comment ends at "-->", not at the first ">": otherwise a
    # comment that contains markup is only partially removed.
    [ "<!--",    "-->" ],
    [ "</a",     ">" ],
    [ "<span",   ">" ],
    [ "</span",  ">" ],
    [ "<font",   ">" ],
    [ "</font",  ">" ],
    # For these two the whole element, content included, is dropped.
    [ "<style",  "</style>" ],
    [ "<script", "</script>" ],
    );
# Opening texts, in processing order.
@htmltags  = map { $_->[0] } @tag_pairs;
# Closing text for the entry with the same index in @htmltags.
@htmltags2 = map { $_->[1] } @tag_pairs;

# Tags that carry a URL: opening text, attribute that holds the URL, and
# closing text.  NOTE(review): the code visible in this file does not read
# these three arrays (the main loop hard-codes the <img / src= / > triple).
@htmlspecial1=(
    "<a",
    "<img",
    );
@htmlspecial2=(
    " href=",
    " src=",
    );
@htmlspecial3=(
    ">",
    ">",
    );

# Main filter loop: reads the message from standard input one line at a time,
# removes the HTML markup and writes the remaining text to standard output.
#
# The processing trace is diagnostic only, so it goes to STDERR and only when
# $debug is true; STDOUT carries nothing but the filtered text.
$debug = 0 unless defined $debug;    # set to 1 to trace processing on STDERR

# STDIN is already open; reading it directly avoids an unchecked two-argument
# open() of the non-portable /dev/stdin path.
while (defined(my $linea = <STDIN>)) {
    my $reng = $linea;
    chomp $reng;
    print STDERR "IN   :$reng\n" if $debug;

    # Every line break on the line becomes a newline, not only the first one.
    $reng =~ s/<br\s*\/?>/\n/gi;

    # First finish whatever was left open by the previous line.  This is an
    # if/elsif on purpose: when strip_url() switches to state 2 its return
    # value is the URL text, which must not be fed to strip_fin() on the same
    # line.
    if ($block == 1) {
        $reng = strip_url($reng, $block_url, $block_fin);
    }
    elsif ($block == 2) {
        $reng = strip_fin($reng, $block_fin);
    }
    $reng = '' unless defined $reng;

    # Nothing pending (any more): strip the tags found on the rest of the line.
    if ($block == 0) {
        for my $tag_n (0 .. $#htmltags) {
            my $tag = $htmltags[$tag_n];
            my $fin = $htmltags2[$tag_n];
            # \Q..\E: the table entries are literal text, not patterns.
            while ($reng =~ /\Q$tag\E/) {
                $reng = strip($tag, $reng, $fin);
            }
        }
        # Images are replaced by the URL found in their src attribute.
        while ($reng =~ /<img/) {
            $reng = strip_special($reng, "<img", "src=", ">");
        }
    }

    print STDERR "b=$block     : $reng\n" if $debug;
    print "$reng\n";
}

# Footer telling the reader that the message was altered.
if ($filtrado != 0) {
    print "filtrado por kmail-html-strip\n";
}
#######################################################################################
#

# strip($tag, $reng, $str_fin)
#
# Removes the first occurrence of a tag from a line.
#
#   $tag      literal text that opens the tag (e.g. "<td")
#   $reng     the line being filtered
#   $str_fin  literal text that closes the tag (e.g. ">" or "</style>")
#
# Returns the line without the span that starts at $tag and ends with
# $str_fin.  When $str_fin is not on the line, everything from $tag onwards is
# dropped, $block_fin is set to $str_fin and $block is set to 2.
# A line that does not contain $tag is returned unchanged.
# Side effect: increments $filtrado whenever something is removed.
sub strip{
    my ($tag, $reng, $str_fin) = @_;

    my $inicio = index($reng, $tag);
    return $reng if $inicio < 0;

    # The closing text is looked for right after the opening text.  Starting
    # at $inicio + length($str_fin) would skip a closing text that follows
    # the tag closely, as in "<style></style>".
    my $fin = index($reng, $str_fin, $inicio + length($tag));

    $filtrado++;

    my $antes = substr($reng, 0, $inicio);
    if ($fin < 0) {
        # The tag continues on a later line.
        $block_fin = $str_fin;
        $block     = 2;
        return $antes;
    }

    return $antes . substr($reng, $fin + length($str_fin));
}
################
#
# Used when a tag was opened on an earlier line
#
# strip_fin($reng, $str_fin)
#
# Continues the removal of a tag that was opened on an earlier line.
#
#   $reng     the line being filtered
#   $str_fin  literal text that closes the tag; when it is missing or empty
#             the pending marker in $block_fin is used, and ">" as a last
#             resort
#
# Returns the text that follows the closing text and sets $block to 0.  When
# the closing text is not on this line the whole line belongs to the tag: the
# empty string is returned and $block is set to 2.
sub strip_fin{
    my ($reng, $str_fin) = @_;

    # An empty marker would match at position 0 and silently end the block
    # without removing anything.
    $str_fin = $block_fin unless defined $str_fin && length $str_fin;
    $str_fin = ">"        unless defined $str_fin && length $str_fin;

    my $fin = index($reng, $str_fin);
    if ($fin < 0) {
        $block = 2;
        return "";
    }

    $block = 0;
    return substr($reng, $fin + length($str_fin));
}

# strip_url($reng, $str_url, $str_fin)
#
# Continues a URL-carrying tag (e.g. <img ...>) that was opened on an earlier
# line and whose URL had not been found yet.
#
#   $reng     the line being filtered
#   $str_url  literal text that precedes the URL (e.g. "src=")
#   $str_fin  literal text that closes the tag; ">" when missing or empty
#
# Returns the replacement text for the line and updates $block:
#   - URL and tag end found:     URL followed by the text after the tag,
#                                $block=0
#   - URL found, tag end not:    the URL, $block=2 and $block_fin=$str_fin
#   - no URL, tag end found:     the text after the tag, $block=0 (the tag
#                                simply had no URL)
#   - neither found:             the empty string, $block=1 with $block_url
#                                and $block_fin recording what to look for
# The URL is returned without surrounding quotes.
sub strip_url{
    my ($reng, $str_url, $str_fin) = @_;
    $str_fin = ">" unless defined $str_fin && length $str_fin;

    my $r_len   = length($reng);
    my $f_len   = length($str_fin);
    my $url_ini = index($reng, $str_url);

    if ($url_ini < 0) {
        my $cierre = index($reng, $str_fin);
        if ($cierre >= 0) {
            # The tag ends here without ever naming a URL.
            $block = 0;
            return substr($reng, $cierre + $f_len);
        }
        # Still inside the tag and still no URL: keep looking on the next
        # line.  $block_url must hold the URL marker, not the end marker.
        $block_url = $str_url;
        $block_fin = $str_fin;
        $block     = 1;
        return "";
    }

    # Extract the attribute value.  A quoted value runs to the matching
    # quote, an unquoted one to the next blank or ">".
    my $pos   = $url_ini + length($str_url);
    my $comilla = substr($reng, $pos, 1);
    my $url;
    my $url_fin;
    if ($comilla eq "\"" || $comilla eq "'") {
        my $cierre = index($reng, $comilla, $pos + 1);
        if ($cierre < 0) {
            $url     = substr($reng, $pos + 1);
            $url_fin = $r_len;
        }
        else {
            $url     = substr($reng, $pos + 1, $cierre - $pos - 1);
            $url_fin = $cierre + 1;
        }
    }
    else {
        ($url)   = substr($reng, $pos) =~ /\A([^\s>]*)/;
        $url_fin = $pos + length($url);
    }

    my $tag_fin = index($reng, $str_fin, $url_fin);
    print STDERR "tag_ini=---  url_ini=$pos url_fin=$url_fin tag_fin=$tag_fin r_len=$r_len\n"
        if $debug;

    if ($tag_fin < 0) {
        # URL captured, but the tag itself ends on a later line.
        $block_fin = $str_fin;
        $block     = 2;
        return $url;
    }

    $block = 0;
    # Skip the closing text itself, as strip_special() does.
    return $url . substr($reng, $tag_fin + $f_len);
}
#########################################
#
#
# strip_special($reng, $tag, $str_url, $str_fin)
#
# Replaces the first occurrence of a URL-carrying tag by its URL.
#
#   $reng     the line being filtered
#   $tag      literal text that opens the tag (e.g. "<img")
#   $str_url  literal text that precedes the URL (e.g. "src=")
#   $str_fin  literal text that closes the tag (e.g. ">")
#
# Returns the line with the tag replaced and updates the multi-line state:
#   - URL and tag end found:   text before the tag + URL + text after the tag
#   - URL found, tag end not:  text before the tag + URL; $block=2 and
#                              $block_fin=$str_fin
#   - no URL, tag end found:   the tag is removed, state untouched
#   - neither found:           text before the tag; $block=1 with $block_url
#                              and $block_fin recording what to look for
# The URL is inserted without surrounding quotes.  A line that does not
# contain $tag is returned unchanged.
sub strip_special{
    my ($reng, $tag, $str_url, $str_fin) = @_;
    print STDERR "special strip\n" if $debug;

    my $r_len   = length($reng);
    my $f_len   = length($str_fin);
    my $tag_ini = index($reng, $tag);
    return $reng if $tag_ini < 0;

    my $antes    = substr($reng, 0, $tag_ini);
    my $attr_ini = $tag_ini + length($tag);
    my $url_ini  = index($reng, $str_url, $attr_ini);

    if ($url_ini < 0) {
        my $cierre = index($reng, $str_fin, $attr_ini);
        if ($cierre >= 0) {
            # Complete tag without a URL: nothing to keep, nothing pending.
            return $antes . substr($reng, $cierre + $f_len);
        }
        # Only the start of the tag is on this line; the URL is looked for
        # on the following ones.  $block_url must hold the URL marker, not
        # the end marker.
        $block_url = $str_url;
        $block_fin = $str_fin;
        $block     = 1;
        return $antes;
    }

    # Extract the attribute value.  A quoted value runs to the matching
    # quote, an unquoted one to the next blank or ">".
    my $pos     = $url_ini + length($str_url);
    my $comilla = substr($reng, $pos, 1);
    my $url;
    my $url_fin;
    if ($comilla eq "\"" || $comilla eq "'") {
        my $cierre = index($reng, $comilla, $pos + 1);
        if ($cierre < 0) {
            $url     = substr($reng, $pos + 1);
            $url_fin = $r_len;
        }
        else {
            $url     = substr($reng, $pos + 1, $cierre - $pos - 1);
            $url_fin = $cierre + 1;
        }
    }
    else {
        ($url)   = substr($reng, $pos) =~ /\A([^\s>]*)/;
        $url_fin = $pos + length($url);
    }

    my $tag_fin = index($reng, $str_fin, $url_fin);
    print STDERR "tag_ini=$tag_ini  url_ini=$pos url_fin=$url_fin tag_fin=$tag_fin r_len=$r_len\n"
        if $debug;

    if ($tag_fin < 0) {
        # URL captured, but the tag itself ends on a later line.
        $block_fin = $str_fin;
        $block     = 2;
        return $antes . $url;
    }

    return $antes . $url . substr($reng, $tag_fin + $f_len);
}

-- 
Configure bugmail: https://bugs.kde.org/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are the assignee for the bug.



More information about the Kdepim-bugs mailing list