oracle:fixhtml
Differences
This shows you the differences between two versions of the page.
Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
oracle:fixhtml [2013/04/05 16:30] – rlunaro | oracle:fixhtml [2024/10/05 17:05] (current) – rlunaro | ||
---|---|---|---|
Line 1: | Line 1: | ||
+ | ====== Fix or Sanitize HTML ====== | ||
+ | |||
+ | Yes: I've found the silver bullet for those of you who are looking for a function that cleans or sanitizes HTML code, especially when it comes from a cut-and-paste operation from Word. | ||
+ | |||
+ | To the point. This snippet: | ||
+ | |||
+ | <code> | ||
+ | select dirty, strip_html(dirty) from dual; | ||
+ | </code> | ||
+ | |||
+ | Removes all the HTML tags from the html code. But this one: | ||
+ | |||
+ | <code> | ||
+ | select dirty, strip_html(dirty, 2) from dual; | ||
+ | </code> | ||
+ | |||
+ | Wipes out all the garbage that is in the HTML code, leaving it --more or less-- "sanitized". | ||
+ | |||
+ | <code plsql> | ||
+ | |||
+ | | ||
+ | to_cvs in number default 0) | ||
+ | return clob is out clob ; | ||
+ | | ||
+ | type arr_string is varray (200) of varchar2(64); | ||
+ | | ||
+ | entities_search_for arr_string; | ||
+ | entities_replace arr_string; | ||
+ | cont number; | ||
+ | | ||
+ | begin | ||
+ | |||
+ | |||
+ | -- to accelerate the issue | ||
+ | if dirty is null then | ||
+ | | ||
+ | end if; -- isnull(dirty) | ||
+ | |||
+ | if length( dirty ) = 0 then | ||
+ | | ||
+ | end if; -- length(dirty) | ||
+ | |||
+ | entities_search_for := arr_string( | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | '& | ||
+ | |||
+ | entities_replace := arr_string( | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | '&', | ||
+ | '"', | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ',', | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | '<', | ||
+ | ' | ||
+ | '>', | ||
+ | '?', | ||
+ | ',', | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | '''', | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | '''', | ||
+ | ' ', | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | |||
+ | out := dirty; | ||
+ | |||
+ | -- replace what is enclosed between <xml> and </ | ||
+ | -- *? -> lazy star (catches the minimum possible) | ||
+ | out := regexp_replace(out, | ||
+ | -- clean what it is inside the style tags | ||
+ | out := regexp_replace(out, | ||
+ | |||
+ | if to_cvs = 2 then | ||
+ | -- sanitize (not clean) the html | ||
+ | |||
+ | -- clean the tag <? | ||
+ | out := regexp_replace(out, | ||
+ | -- clean the tags <img whatever> | ||
+ | out := regexp_replace(out, | ||
+ | -- clean comments | ||
+ | out := regexp_replace(out,'< | ||
+ | -- clean meta | ||
+ | out := regexp_replace(out,'< | ||
+ | -- clean link | ||
+ | out := regexp_replace(out,'< | ||
+ | -- clean DIV | ||
+ | out := regexp_replace(out,'</? | ||
+ | -- clean SPAN | ||
+ | out := regexp_replace(out,'</? | ||
+ | -- clean "class inside tags" | ||
+ | out := regexp_replace(out,' | ||
+ | -- clean " | ||
+ | out := regexp_replace(out,' | ||
+ | -- clean namespaces <o:p> </ | ||
+ | out := regexp_replace(out, | ||
+ | out := regexp_replace(out, | ||
+ | |||
+ | -- clean empty opening and closing tags: it has to be | ||
+ | -- passed twice or three times to clean things like this: | ||
+ | -- < | ||
+ | -- TWEAK: < | ||
+ | out := regexp_replace(out,'< | ||
+ | out := regexp_replace(out,'< | ||
+ | -- TWEAK: < | ||
+ | out := regexp_replace(out,'< | ||
+ | out := regexp_replace(out,'< | ||
+ | |||
+ | else | ||
+ | -- clean html | ||
+ | |||
+ | -- replace all the stuff that is similar to a carriage return | ||
+ | out := regexp_replace(out, | ||
+ | out := regexp_replace(out, | ||
+ | out := regexp_replace(out, | ||
+ | | ||
+ | -- replace all the remaining html stuff | ||
+ | out := regexp_replace(out,'< | ||
+ | | ||
+ | -- replace all the entities | ||
+ | for cont in 1..119 loop | ||
+ | out := replace( out, entities_search_for(cont), | ||
+ | end loop; | ||
+ | | ||
+ | -- cleaning for export to cvs | ||
+ | if to_cvs = 1 then | ||
+ | out := replace( out, chr(10), '' | ||
+ | out := replace( out, chr(13), '' | ||
+ | out := replace( out, chr(9), '' | ||
+ | out := replace( out, ';', | ||
+ | out := replace( out, '"', | ||
+ | end if; | ||
+ | |||
+ | |||
+ | end if; | ||
+ | |||
+ | | ||
+ | return(out); | ||
+ | end strip_html; | ||
+ | |||
+ | |||
+ | |||
+ | </code> | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||