oracle:fixhtml
Differences
This shows you the differences between two versions of the page.
| Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
| oracle:fixhtml [2013/04/05 16:30] – rlunaro | oracle:fixhtml [2024/10/05 17:05] (current) – rlunaro | ||
|---|---|---|---|
| Line 1: | Line 1: | ||
| + | ====== Fix or Sanitize HTML ====== | ||
| + | |||
| + | Yes: I've found the silver bullet for those of you who are looking for a function that cleans HTML code or sanitizes it, especially if it comes from a cut-and-paste operation from Word. | ||
| + | |||
| + | To the point. This snippet: | ||
| + | |||
| + | < | ||
| + | select dirty, strip_html(dirty) from dual; | ||
| + | </ | ||
| + | |||
| + | Removes all the HTML tags from the HTML code. But this one: | ||
| + | |||
| + | < | ||
| + | select dirty, strip_html(dirty, | ||
| + | </ | ||
| + | |||
| + | Wipes out all the garbage that is in the HTML code, leaving it --more or less-- " | ||
| + | |||
| + | <code plsql> | ||
| + | |||
| + | | ||
| + | to_cvs in number default 0) | ||
| + | return clob is out clob ; | ||
| + | | ||
| + | type arr_string is varray (200) of varchar2(64); | ||
| + | | ||
| + | entities_search_for arr_string; | ||
| + | entities_replace arr_string; | ||
| + | cont number; | ||
| + | | ||
| + | begin | ||
| + | |||
| + | |||
| + | -- to accelerate the issue | ||
| + | if dirty is null then | ||
| + | | ||
| + | end if; -- isnull(dirty) | ||
| + | |||
| + | if length( dirty ) = 0 then | ||
| + | | ||
| + | end if; -- length(dirty) | ||
| + | |||
| + | entities_search_for := arr_string( | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | '& | ||
| + | |||
| + | entities_replace := arr_string( | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | '&', | ||
| + | '"', | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ',', | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | '<', | ||
| + | ' | ||
| + | '>', | ||
| + | '?', | ||
| + | ',', | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | '''', | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | '''', | ||
| + | ' ', | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | |||
| + | out := dirty; | ||
| + | |||
| + | -- replace what is enclosed between <xml> and </ | ||
| + | -- *? -> lazy star (catches the minimum possible) | ||
| + | out := regexp_replace(out, | ||
| + | -- clean what it is inside the style tags | ||
| + | out := regexp_replace(out, | ||
| + | |||
| + | if to_cvs = 2 then | ||
| + | -- sanitize (not clean) the html | ||
| + | |||
| + | -- clean the tag <? | ||
| + | out := regexp_replace(out, | ||
| + | -- clean the tags <img whatever> | ||
| + | out := regexp_replace(out, | ||
| + | -- clean comments | ||
| + | out := regexp_replace(out,'< | ||
| + | -- clean meta | ||
| + | out := regexp_replace(out,'< | ||
| + | -- clean link | ||
| + | out := regexp_replace(out,'< | ||
| + | -- clean DIV | ||
| + | out := regexp_replace(out,'</? | ||
| + | -- clean SPAN | ||
| + | out := regexp_replace(out,'</? | ||
| + | -- clean "class inside tags" | ||
| + | out := regexp_replace(out,' | ||
| + | -- clean " | ||
| + | out := regexp_replace(out,' | ||
| + | -- clean namespaces <o:p> </ | ||
| + | out := regexp_replace(out, | ||
| + | out := regexp_replace(out, | ||
| + | |||
| + | -- clean empty opening and closing tags: it has to be | ||
| + | -- passed twice or three times to clean things like this: | ||
| + | -- < | ||
| + | -- TWEAK: < | ||
| + | out := regexp_replace(out,'< | ||
| + | out := regexp_replace(out,'< | ||
| + | -- TWEAK: < | ||
| + | out := regexp_replace(out,'< | ||
| + | out := regexp_replace(out,'< | ||
| + | |||
| + | else | ||
| + | -- clean html | ||
| + | |||
| + | -- replace all the stuff that is similar to a carriage return | ||
| + | out := regexp_replace(out, | ||
| + | out := regexp_replace(out, | ||
| + | out := regexp_replace(out, | ||
| + | | ||
| + | -- replace all the remaining html stuff | ||
| + | out := regexp_replace(out,'< | ||
| + | | ||
| + | -- replace all the entities | ||
| + | for cont in 1..119 loop | ||
| + | out := replace( out, entities_search_for(cont), | ||
| + | end loop; | ||
| + | | ||
| + | -- cleaning for export to cvs | ||
| + | if to_cvs = 1 then | ||
| + | out := replace( out, chr(10), '' | ||
| + | out := replace( out, chr(13), '' | ||
| + | out := replace( out, chr(9), '' | ||
| + | out := replace( out, ';', | ||
| + | out := replace( out, '"', | ||
| + | end if; | ||
| + | |||
| + | |||
| + | end if; | ||
| + | |||
| + | | ||
| + | return(out); | ||
| + | end strip_html; | ||
| + | |||
| + | |||
| + | |||
| + | </ | ||
| + | |||
| + | |||
| + | |||
| + | |||
| + | |||
