Stripping html tags

Andre Garzia andre at andregarzia.com
Tue Nov 13 09:26:31 EST 2007


Hello Friends,

Since we're all sharing routines here, here goes a tricky one.
This routine converts the htmlText content of a field to XHTML
that can be used in a standards-compliant page. The routine is slow
and expensive. It does not strip HTML, but since we're on the topic
of processing HTML tags I thought I'd share it...

function htmltexttoxhtml p1
    ## Converts a field's htmlText (passed in p1) into leaner XHTML-style markup
    ## and returns the converted text.
    ##
    ## What it does, in order:
    ## --> Turns empty paragraphs and <br> tags into line breaks, trims leading
    ##     and trailing spaces / line breaks, then writes the remaining line
    ##     breaks back as "<br />".
    ## --> Removes color="..." and face="..." attributes (these belong in CSS).
    ## --> Converts bold paragraphs with font size 34/24/18/14 to <h1>..<h4>, and
    ##     bold-italic paragraphs with font size 12 to <h5>.
    ## --> Removes any remaining size="..." attributes and the emptied <font>
    ##     tags, drops "<p>" and turns "</p>" into a line break.
    ##
    ## It is an expensive function: every step re-scans the whole text with a
    ## regular expression until no further match is found.

    local tRegex, tMatch, tAttr, tSize, tLevel, tOpen, tClose, tOld, tNew

    ## first we normalize things into CRs...
    replace "<p></p>" & cr with cr in p1 -- popular in htmltext..
    replace "<p></p>" with cr in p1 -- in case it appears without a cr
    replace "<br />" with cr in p1
    replace "<br>" with cr in p1

    ## then we trim them...
    repeat while ((char -1 of p1 is " ") or (char -1 of p1 is cr)) -- trim end.
        delete char -1 of p1
    end repeat
    repeat while ((char 1 of p1 is " ") or (char 1 of p1 is cr)) -- trim start.
        delete char 1 of p1
    end repeat

    replace cr with "<br />" in p1

    ## take care of colors and font faces... both should go into the CSS.
    repeat for each item tAttr in "color,face"
        put format("%s=\"([^\"]*)\"", tAttr) into tRegex
        repeat while matchText(p1, tRegex, tMatch)
            replace format("%s=\"%s\"", tAttr, tMatch) with empty in p1
        end repeat
    end repeat

    ## take care of h1..h5: each font size maps to one header level.
    ## h5 (size 12) is the only level that is bold AND italic.
    put 0 into tLevel
    repeat for each item tSize in "34,24,18,14,12"
        add 1 to tLevel
        if tLevel is 5 then
            put "<b><i>" into tOpen
            put "</i></b>" into tClose
        else
            put "<b>" into tOpen
            put "</b>" into tClose
        end if
        put format("<p><font size=\"%s\">", tSize) & tOpen & "([^<]*)" & tClose & "</font></p>" into tRegex
        repeat while matchText(p1, tRegex, tMatch)
            ## the captured text is replaced literally, so regex
            ## metacharacters inside the heading text are harmless here.
            put format("<p><font size=\"%s\">", tSize) & tOpen & tMatch & tClose & "</font></p>" into tOld
            put "<h" & tLevel & ">" & tMatch & "</h" & tLevel & ">" into tNew
            replace tOld with tNew in p1
        end repeat
    end repeat

    ## take care of size... size should go into the CSS.
    ## This must run after the header conversion, which keys on size.
    put format("size=\"([^\"]*)\"") into tRegex
    repeat while matchText(p1, tRegex, tMatch)
        replace format("size=\"%s\"", tMatch) with empty in p1
    end repeat

    ## Removing an attribute leaves its surrounding spaces behind, so a tag
    ## that carried one, two or three attributes ends up with that many spaces.
    replace "<font >" with empty in p1
    replace "<font  >" with empty in p1
    replace "<font   >" with empty in p1
    replace "</font>" with empty in p1
    replace "<p>" with empty in p1
    replace "</p>" with cr in p1

    return p1
end htmltexttoxhtml



More information about the use-livecode mailing list