| 1 | # -*- coding: utf-8 -*- |
|---|
| 2 | """ |
|---|
| 3 | zine.parsers |
|---|
| 4 | ~~~~~~~~~~~~ |
|---|
| 5 | |
|---|
| 6 | This module holds the base parser information and the dict of |
|---|
| 7 | default parsers. |
|---|
| 8 | |
|---|
| 9 | :copyright: (c) 2010 by the Zine Team, see AUTHORS for more details. |
|---|
| 10 | :license: BSD, see LICENSE for more details. |
|---|
| 11 | """ |
|---|
| 12 | from zine.i18n import lazy_gettext |
|---|
| 13 | from zine.application import iter_listeners, get_application |
|---|
| 14 | from zine.utils.zeml import parse_html, parse_zeml, sanitize, split_intro, \ |
|---|
| 15 | Element, RootElement |
|---|
| 16 | from zine.utils.xml import replace_entities |
|---|
| 17 | |
|---|
| 18 | |
|---|
def parse(input_data, parser=None, reason='unknown'):
    """Build a doc tree from `input_data` and let plugins post-process it.

    The input is split into lines and re-joined with single newline
    characters before it is handed to the parser.

    :param input_data: the markup source to parse.
    :param parser: the key of the parser in the application's parser
                   mapping.  If `None`, the parser configured as
                   ``default_parser`` is used; when that one cannot be
                   found the config value is reverted to its default and
                   the lookup is repeated.
    :param reason: tells plugins why the text is parsed so that they can
                   decide whether they want to process it or not.  For
                   example a normal blog post would have the reason
                   'post', a comment 'comment', an isolated page from a
                   plugin maybe 'page' etc.
    :raises ValueError: if an explicitly requested parser does not exist.
    :return: the tree produced by the parser, or the last non-`None`
             value returned by a `process-doc-tree` listener.
    """
    input_data = u'\n'.join(input_data.splitlines())
    app = get_application()

    if parser is not None:
        # an explicitly requested parser has to exist.
        try:
            parser = app.parsers[parser]
        except KeyError:
            raise ValueError('parser %r does not exist' % (parser,))
    else:
        try:
            parser = app.parsers[app.cfg['default_parser']]
        except KeyError:
            # the plugin that provided the default parser is no longer
            # available.  reset the config value to the builtin parser
            # and look it up again.
            transaction = app.cfg.edit()
            transaction.revert_to_default('default_parser')
            transaction.commit()
            parser = app.parsers[app.cfg['default_parser']]

    tree = parser.parse(input_data, reason)

    #! allow plugins to alter the doctree.
    for listener in iter_listeners('process-doc-tree'):
        replacement = listener(tree, input_data, reason)
        # a listener may modify the tree in place (returning `None`) or
        # hand back a new tree that replaces the current one.
        if replacement is not None:
            tree = replacement

    return tree
|---|
| 55 | |
|---|
| 56 | |
|---|
def render_preview(text, parser, component='post'):
    """Return the HTML preview of `text` as parsed by `parser`.

    The text is parsed with the reason ``'<component>-preview'``.  If the
    resulting tree has an intro, the intro is wrapped in a
    ``<div class="intro">`` that is followed by the body; otherwise only
    the body HTML is returned.
    """
    reason = '%s-preview' % component
    intro, body = split_intro(parse(text, parser, reason))
    if not intro:
        return body.to_html()
    return u'<div class="intro">%s</div>%s' % (intro.to_html(),
                                               body.to_html())
|---|
| 67 | |
|---|
| 68 | |
|---|
class MarkupExtension(object):
    """Base class for markup extensions that are independent of the
    markup language in use.

    Subclasses configure the extension with these class attributes:

    `name`
        The name that makes the extension accessible.  XML-like markup
        languages use it as tag name, reStructuredText (reST) as
        directive name, etc.
    `is_block_level`
        Set to True if the element has to be rendered as a block-level
        element.  The way the element is accessed may depend on this as
        well; reST for example uses roles for inline elements and
        directives for block-level elements.
    `is_void`
        Set to True if the element has no content.
    `is_isolated`
        Set to True if the markup parser should neither parse the
        contents of the element nor convert them to a ZEML tree.
    `broken_by`
        A sequence with the names of the elements that implicitly close
        this element.  Only XML-like markup languages honor this.
    `attributes`
        The set of attribute (option) names that are allowed.  Not every
        markup language supports attributes on inline elements.
    `argument_attribute`
        Some markup languages know arguments to elements in addition to
        attributes.  If this is the name of an attribute listed in
        `attributes`, the element accepts one argument which is mapped
        to that attribute.  Not every markup language supports arguments
        on inline elements.

    Subclasses have to override the `process` method, which is passed
    three arguments:

    `attributes`
        The attributes (options) of the markup element as dictionary.
    `content`
        The content of the element.  Unless `is_isolated` is True this
        was parsed by the markup parser already and is a ZEML tree; for
        isolated elements it is the raw text.
    `reason`
        The reason for parsing -- one of "post", "comment",
        "post-preview", "comment-preview", or "system".  The behavior of
        the element may depend on the reason, e.g. to switch off
        potentially unsafe features for comments.

    The return value of `process` must be a ZEML tree.
    """

    # how the extension is addressed from the markup
    name = None
    attributes = set()
    argument_attribute = None

    # how the element and its content are treated by the parser
    is_block_level = True
    is_void = False
    is_isolated = False
    broken_by = None

    def __init__(self, app):
        self.app = app

    def process(self, attributes, content, reason):
        """Invoked whenever the element is encountered; has to be
        provided by subclasses.
        """
        raise NotImplementedError()
|---|
| 132 | |
|---|
| 133 | |
|---|
class BaseParser(object):
    """Common base class of all parsers.

    A parser is created with the application it belongs to, which is
    kept on the instance as `app`.
    """

    #: the localized name of the parser.  Subclasses are expected to
    #: replace the `None` placeholder.
    name = None

    def __init__(self, app):
        self.app = app

    def parse(self, input_data, reason):
        """Parse `input_data` for the given `reason` and return a ZEML
        tree.  Has to be implemented by subclasses.
        """
        raise NotImplementedError()
|---|
| 146 | |
|---|
| 147 | |
|---|
class ZEMLParser(BaseParser):
    """Parser for the ZEML markup language."""

    name = lazy_gettext('Zine-Markup')

    def parse(self, input_data, reason):
        """Parse ZEML source into a tree, using the markup extensions
        registered on the application.  The tree is passed through
        `sanitize` only if the reason is exactly 'comment'.
        """
        tree = parse_zeml(input_data, reason, self.app.markup_extensions)
        return sanitize(tree) if reason == 'comment' else tree
|---|
| 158 | |
|---|
| 159 | |
|---|
class HTMLParser(BaseParser):
    """Parser for plain old HTML."""

    name = lazy_gettext('HTML')

    def parse(self, input_data, reason):
        """Parse HTML source into a tree.  The tree is passed through
        `sanitize` only if the reason is exactly 'comment'.
        """
        tree = parse_html(input_data)
        if reason != 'comment':
            return tree
        return sanitize(tree)
|---|
| 170 | |
|---|
| 171 | |
|---|
class PlainTextParser(BaseParser):
    """Turns simple text into a ZEML tree with the help of pottymouth."""

    name = lazy_gettext('Text')

    def _to_text(self, token):
        """Return the unicode form of `token` after it went through
        `replace_entities`.
        """
        as_unicode = unicode(token)
        return replace_entities(as_unicode)

    def _to_zeml(self, node, untrusted=False):
        """Build a ZEML tree from a pottymouth node.

        :param node: the pottymouth node that becomes the root element.
        :param untrusted: if true, every resulting `a` element gets
                          ``rel="nofollow"``.
        """
        from zine._ext.pottymouth import Token
        textual_types = (str, unicode, Token)

        def append_text(target, text):
            # text belongs behind the last child if there is one,
            # otherwise it is part of the element's own text.
            if not target.children:
                target.text += text
            else:
                target.children[-1].tail += text

        def build(source, is_root):
            element = RootElement() if is_root else Element(source.name)
            if source._attributes:
                element.attributes.update(source._attributes)

            for item in source:
                if isinstance(item, textual_types):
                    append_text(element, self._to_text(item))
                    continue

                child = build(item, False)
                if child.name != 'span' or child.attributes:
                    element.children.append(child)
                    continue

                # spans without attributes are useless; merge their
                # text, children and tail into the current element.
                append_text(element, child.text)
                element.children.extend(child.children)
                append_text(element, child.tail)

            # fixes an output bug from pottymouth: a paragraph whose
            # only child is a blockquote is replaced by that blockquote.
            if len(element.children) == 1 and source.name == 'p' and \
               element.children[0].name == 'blockquote':
                element = element.children[0]

            # untrusted posts get nofollow on links
            if untrusted and element.name == 'a':
                element.attributes['rel'] = 'nofollow'

            return element

        return build(node, True)

    def parse(self, input_data, reason):
        """Parse plain text with pottymouth and convert the result into
        a ZEML tree.  Input parsed for the reason 'comment' is treated
        as untrusted.
        """
        from zine._ext.pottymouth import PottyMouth
        all_links = not self.app.cfg['plaintext_parser_nolinks']
        potty = PottyMouth(emdash=False, ellipsis=False,
                           smart_quotes=False, youtube=False, image=False,
                           italic=False, all_links=all_links)
        potty_tree = potty.parse(input_data)
        return self._to_zeml(potty_tree, reason == 'comment')
|---|
| 230 | |
|---|
| 231 | |
|---|
#: the default parsers defined in this module, keyed by their short name.
all_parsers = dict((
    ('zeml', ZEMLParser),
    ('html', HTMLParser),
    ('text', PlainTextParser),
))
|---|