From adb96a3773086c20f2167d81696ebdfcdd186228 Mon Sep 17 00:00:00 2001 From: Chris Date: Fri, 17 Jul 2015 15:19:59 +0100 Subject: [PATCH] Support HMTL to markdown translation. --- Webperl/Template.pm | 120 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 119 insertions(+), 1 deletion(-) diff --git a/Webperl/Template.pm b/Webperl/Template.pm index b7f67da..ebfa987 100644 --- a/Webperl/Template.pm +++ b/Webperl/Template.pm @@ -120,13 +120,17 @@ # - `{V_[admin_email]}` is replaced by the site admin email address. package Webperl::Template; +use experimental qw(smartmatch); use POSIX qw(strftime); use Webperl::Utils qw(path_join superchomp); use Carp qw(longmess); +use HTML::WikiConverter; +use HTML::Entities; +use v5.12; use strict; -our ($errstr, $utfentities, $entities, $ords, @timescales); +our ($errstr, $utfentities, $entities, $entitymap, $ords, @timescales); BEGIN { $errstr = ''; @@ -150,6 +154,23 @@ BEGIN { '\x88' => '…', # 0x88 (133) is an ellisis }; + $entitymap = { '–' => '-', + '—' => '-', + '’' => "'", + '‘' => "'", + '“' => '"', + '”' => '"', + '…' => '...', + '>' => '>', + '<' => '<', + '&' => '&', + ' ' => ' ', + '​' => '', + '\xE2\x80\x93' => '-', + '\xE2\x80\x94' => '-', + '\xE2\x80\xA6' => '...' + }; + $ords = {1 => "st", 2 => "nd", 3 => "rd", @@ -208,6 +229,7 @@ sub new { "mailcmd" => '/usr/sbin/sendmail -t -f chris@starforge.co.uk',#pevesupport@cs.man.ac.uk', # Change -f as needed! "entities" => $entities, "utfentities" => $utfentities, + "entitymap" => $entitymap, "blockname" => 0, "usecache" => 1, "replacelimit" => 5, @@ -959,6 +981,102 @@ sub html_strip { } +## @method $ html_to_markdown($html, $images, $tmplnames, $extramode) +# Convert the specified html into markdown text. +# +# @param html The HTML to convert to markdown. +# @param images An optional reference to an array of images. +# @param tmplnames A reference to a hash containing template names. +# @param extramode If true, turn on support for Markdown Extra mode. +# @return The markdown version of the text. +sub html_to_markdown { + my $self = shift; + my $html = shift; + my $images = shift; + my $tmplnames = shift; + my $extramode = shift || 0; + + # Handle html entities that are going to break... + foreach my $entity (keys(%{$self -> {"entitymap"}})) { + $html =~ s/$entity/$self->{entitymap}->{$entity}/g; + } + + # Strip gravatar links + $html =~ s|]+src="https://gravatar.com/[^>]+> ||g; + + my $converter = new HTML::WikiConverter(dialect => 'Markdown', + link_style => 'inline', + image_tag_fallback => 0, + md_extra => $extramode, + encoding => 'utf8'); + my $body = $converter -> html2wiki($html); + + # Clean up html the converter misses consistently + $body =~ s||\n|g; + $body =~ s|>|>|g; + + # WikiConverter's markdown converter knows not about tables + $body =~ s{}{}g; + + # fix title links + $body =~ s|^(#+\s+)\[ \[.*?\]\(.*?\) \]<>|$1|mg; + $body =~ s|^(#+\s+)<>|$1|mg; + + # Strip anchors + $body =~ s|\t\{#.*?\}||g; + $body =~ s|\[([^\]]+)\]\(#.*?\)|$1|g; + + # Convert titles + $body =~ s|^(#+)\s+(.*?)$|_markdown_underline($1, $2)|gem; + + # Sometimes there are bizarre <>s left in the content, dunno why... + $body =~ s|<>||g; + + $body =~ s|\n\n+|\n\n|g; + + my $imglist = ""; + for(my $i = 0; $i < 3; ++$i) { + next unless($images -> [$i] -> {"location"}); + + $imglist .= $self -> load_template($tmplnames -> {"image"}, {"***url***" => $images -> [$i] -> {"location"}}); + } + + my $imageblock = $self -> load_template($tmplnames -> {"images"}, {"***images***" => $imglist}) + if($imglist); + + return $self -> load_template($tmplnames -> {"markdown"}, {"***text***" => $body, + "***images***" => $imageblock}); +} + + +## @sub private $ _markdown_underline($level, $title) +# Given a markdown underline level and matching title, generate a +# replacement that uses underscores rather than #, ##, ###, etc +# +# @param level The markdown '#'-style underline level +# @param title The text of the title +# @return A new string containing the title with underscores. +sub _markdown_underline { + my $level = shift; + my $title = shift; + my $type; + + given($level) { + when("#") { $type = "-="; } + when("##") { $type = "="; } + when("###") { $type = "-"; } + } + return $title if(!$type); + + # Build the underscores - may need additional trimming as 'type' can be multichar + my $underscore = $type x length($title); + $underscore = substr($underscore, 0, length($title)) + if(length($underscore) > length($title)); + + return $title."\n".$underscore; +} + + ## @method $ bytes_to_human($bytes, $long) # Convenience wrappper around humanise_bytes for backwards compatibility. #