webperl/HTMLValidator.pm

207 lines
7.3 KiB
Perl

## @file
# HTML validation and checking functions. This file contains functions to
# support the cleaning and checking of html using a combination of
# HTML::Scrubber to do first-stage cleaning, HTML::Tidy to clear up the
# content as needed, and the W3C validator via the WebService::Validator::HTML::W3C
# to ensure that the xhtml generated by HTML::Tidy is valid.
#
# @author Chris Page <chris@starforge.co.uk>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
package HTMLValidator;
require Exporter;
use Encode;
use HTML::Scrubber;
use HTML::Tidy;
use WebService::Validator::HTML::W3C;
use strict;
our @ISA = qw(Exporter);
our @EXPORT = qw(scrub_html tidy_html check_xhtml);
# =============================================================================
# HTML::Scrubber related code
# List of tags we are going to let through, lifted from the security
# discussion on http://wiki.moxiecode.com/index.php/TinyMCE:Security
# Several tags removed to make xhtml conformance easier and to remove
# deprecated and eyestabbery.
my @allow = ("a", "b", "blockquote", "br", "caption", "col", "colgroup", "comment",
"em", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "img", "li", "ol", "p",
"pre", "small", "span", "strong", "sub", "sup", "table", "tbody", "td",
"tfoot", "th", "thead", "tr", "tt", "ul");
# Explicit rules for allowed tags, required to provide per-tag tweaks to the filter.
my @rules = (
img => {
src => qr{^(?:http|https)://}i,
alt => 1,
style => 1,
width => 1,
height => 1,
'*' => 0,
},
a => {
href => qr{^(?:http|https)://}i,
name => 1,
'*' => 0,
},
table => {
cellspacing => 1,
cellpadding => 1,
style => 1,
class => 1,
'*' => 0,
},
td => {
colspan => 1,
rowspan => 1,
style => 1,
'*' => 0,
},
blockquote => {
cite => qr{^(?:http|https)://}i,
style => 1,
'*' => 0,
},
span => {
class => 1,
style => 1,
title => 1,
'*' => 0,
},
div => {
class => 1,
style => 1,
title => 1,
'*' => 0,
},
);
# Default ruleset applied when no explicit rule is found for a tag.
my @default = (
0 => # default rule, deny all tags
{
'href' => qr{^(?:http|https)://[-\w]+(?:\.[-\w]+)/}i, # Force basic URL forms
'src' => qr{^(?:http|https)://[-\w]+(?:\.[-\w]+)/}i, # Force basic URL forms
'style' => qr{^((?!expr|java|script|eval|\r|\n|\t).)*$}i, # kill godawful insane dynamic css shit (who the fuck thought this would be a good idea?)
'name' => 1,
'*' => 0, # default rule, deny all attributes
}
);
## @fn $ scrub_html($html)
# Remove dangerous/unwanted elements and attributes from a html document. This will
# use HTML::Scrubber to remove the elements and attributes from the specified html
# that could be used maliciously. There is still the potential for a clever attacker
# to craft a page that bypasses this, but that exists pretty much regardless once
# html input is permitted...
#
# @param html The string containing the html to clean up
# @return A string containing the scrubbed html.
sub scrub_html {
my $html = shift;
# Die immediately if there's a nul character in the string, that should never, ever be there.
die_log("HACK ATTEMPT", "Hack attempt detected. Sod off.")
if($html =~ /\0/);
# First, a new scrubber
my $scrubber = HTML::Scrubber -> new(allow => \@allow,
rules => \@rules,
default => \@default,
comment => 0,
process => 0);
# fix problems with the parser setup. This is hacky and nasty,
# but from CPAN's bug tracker, this appears to have been present for
# the past 3 years at least.
if(exists $scrubber -> {_p}) {
# Allow for <img />, <br/>, <p></p>, and so on
$scrubber -> {_p} -> empty_element_tags(1);
# Make sure that HTML::Parser doesn't scream about utf-8 from the form
$scrubber -> {_p} -> utf8_mode(1)
if($scrubber -> {_p} -> can('utf8_mode'));
}
# And throw the html through the scrubber
return $scrubber -> scrub($html);
}
# ==============================================================================
# HTML::Tidy related code
## @fn $ tidy_html($html, $options)
# Pass a chunk of html through htmltidy. This should produce well-formed xhtml
# that can be passed on to the validator to check.
#
# @param html The string containing html to tidy.
# @param options A reference to a hash containing options to pass to HTML::Tidy.
# @return The html generated by htmltidy.
sub tidy_html {
my $html = shift;
my $options = shift;
# Create a new tidy object
my $tidy = HTML::Tidy->new($options);
return $tidy -> clean($html);
}
# ==============================================================================
# WebService::Validator::HTML::W3C related code
## @fn @ check_xhtml($xhtml, $options)
# Check that the xhtml is valid by passing it through the W3C validator service.
# If this is unable to contact the validation service, it will return the reason,
# otherwise the number of errors will be returned (0 indicates that the xhtml
# passed validation with no errors)
#
# @param xhtml The xhtml to validate with the W3C validator
# @param options A hash containing options to pass to the validator module.
# Currently supports 'timeout' and 'uri'.
# @return The number of errors during validation (0 = valid), or a string
# from the validator module explaining why the validation bombed.
sub check_xhtml {
my $xhtml = shift;
my $options = shift;
return 0;
# Create a validator
my $validator = WebService::Validator::HTML::W3C -> new(http_timeout => $options -> {"timeout"},
validator_uri => $options -> {"uri"});
# Throw the xhtml at the validator
if($validator -> validate_markup(Encode::encode_utf8($xhtml))) {
# return 0 to indicate it is valid
return 0
if($validator -> is_valid());
my $errs = "";
foreach my $err (@{$validator -> errors}) {
$errs .= $err -> msg." at line ".$err -> line."<br/>";
}
# otherwise, the xhtml is not valid, so return the error count
return $validator -> num_errors().":$errs";
}
# Get here and the validation request fell over, return the 'oh shit' result...
return $validator -> validator_error();
}
1;