3 namespace League\HTMLToMarkdown;
8 * A helper class to convert HTML to Markdown.
10 * @author Colin O'Dell <colinodell@gmail.com>
11 * @author Nick Cernis <nick@cern.is>
13 * @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub.
15 * @license http://www.opensource.org/licenses/mit-license.php MIT
/**
 * Environment assigned by the constructor; its getConfig() supplies the options this class reads.
 *
 * @var Environment
 */
protected $environment;
/**
 * Builds the converter around a ready-made environment or a set of option overrides.
 *
 * An Environment instance is stored as-is. An array is treated as overrides that are
 * merged into the configuration of a default environment created from the built-in
 * option values below. Any other argument type leaves the environment unset.
 *
 * @param Environment|array $options Environment object or configuration options
 */
public function __construct($options = array())
{
    // A caller-supplied environment is used untouched.
    if ($options instanceof Environment) {
        $this->environment = $options;

        return;
    }

    if (!is_array($options)) {
        return;
    }

    $builtInOptions = array(
        // 'setext' by default; 'atx' outputs H1 and H2 headers as # Header1 and ## Header2
        'header_style' => 'setext',
        // true hides warnings raised while loading malformed HTML; false shows them
        'suppress_errors' => true,
        // true strips tags (not their content) that have no Markdown equivalent,
        // which is useful for cleaning MS Word HTML output
        'strip_tags' => false,
        // '**' by default; '__' selects the underlined style
        'bold_style' => '**',
        // '_' by default; '*' selects the asterisk style
        'italic_style' => '_',
        // space-separated list of DOM nodes to remove, e.g. 'meta style script'
        'remove_nodes' => '',
        // true turns <br> into `\n` instead of ` \n`
        'hard_break' => false,
    );

    $environment = Environment::createDefaultEnvironment($builtInOptions);
    $this->environment = $environment;

    // Caller overrides are merged into the default environment's configuration.
    $environment->getConfig()->merge($options);
}
/**
 * Exposes the environment held by this converter.
 *
 * @return Environment
 */
public function getEnvironment()
{
    $environment = $this->environment;

    return $environment;
}
/**
 * Shortcut to the configuration object owned by the environment.
 *
 * @return Configuration
 */
public function getConfig()
{
    $environment = $this->environment;

    return $environment->getConfig();
}
/**
 * Lets the converter object be called like a function; delegates to convert().
 *
 * @see HtmlConverter::convert
 *
 * @param string $html
 *
 * @return string The Markdown version of the html
 */
public function __invoke($html)
{
    $markdown = $this->convert($html);

    return $markdown;
}
/**
 * Converts an HTML string to Markdown.
 *
 * Parses the HTML into a DOM document, converts the tree below the <html> element,
 * then serialises the document and sanitizes the result.
 *
 * @param string $html
 *
 * @throws \InvalidArgumentException When the parsed document contains no <html> element
 *
 * @return string The Markdown version of the html
 */
public function convert($html)
{
    // Input that is empty or whitespace-only has nothing to convert.
    $isBlank = trim($html) === '';
    if ($isBlank) {
        return '';
    }

    $dom = $this->createDOMDocument($html);

    // The entire tree is processed (head and body alike), starting from <html>.
    $htmlNode = $dom->getElementsByTagName('html')->item(0);
    if (!$htmlNode) {
        throw new \InvalidArgumentException('Invalid HTML was provided');
    }

    $this->convertChildren(new Element($htmlNode));

    // Serialise the now-modified document and clean up the resulting string.
    return $this->sanitize($dom->saveHTML());
}
/**
 * Parses an HTML string into a DOMDocument, treating the input as UTF-8.
 *
 * When the 'suppress_errors' option is enabled, libxml's internal error handling is
 * switched on for the duration of the load; afterwards the collected errors are
 * cleared and the previous libxml setting is restored so the change does not leak
 * into unrelated code running in the same process.
 *
 * @param string $html
 *
 * @return \DOMDocument
 */
private function createDOMDocument($html)
{
    $document = new \DOMDocument();

    // Read the option once so enabling and restoring always happen as a pair.
    $suppressErrors = (bool) $this->getConfig()->getOption('suppress_errors');
    $previousState = false;

    if ($suppressErrors) {
        // libxml_use_internal_errors() returns the previous setting; keep it for the restore below.
        $previousState = libxml_use_internal_errors(true);
    }

    // Hack to load utf-8 HTML: the encoding declaration is prepended to the markup.
    $document->loadHTML('<?xml encoding="UTF-8">' . $html);
    $document->encoding = 'UTF-8';

    if ($suppressErrors) {
        libxml_clear_errors();
        libxml_use_internal_errors($previousState);
    }

    return $document;
}
/**
 * Recursively converts an element and everything beneath it to Markdown, inside out.
 *
 * Children are converted before their parent, so conversion starts at the innermost
 * elements and works up to the outermost one. Each converted element has its final
 * Markdown stored via setFinalMarkdown().
 *
 * @param ElementInterface $element
 */
private function convertChildren(ElementInterface $element)
{
    // Content inside <pre> and <code> stays as HTML. The one exception is a <code>
    // element itself, which still has to go through its converter.
    if ($element->isDescendantOf(array('pre', 'code'))) {
        if ($element->getTagName() !== 'code') {
            return;
        }
    }

    // Depth first: every child is converted before the element itself.
    $children = $element->hasChildren() ? $element->getChildren() : array();
    foreach ($children as $childElement) {
        $this->convertChildren($childElement);
    }

    // With the children done, convert this element and record the result,
    // e.g. '<h3>Title</h3>' ends up as '### Title'.
    $element->setFinalMarkdown($this->convertToMarkdown($element));
}
/**
 * Convert to Markdown
 *
 * Produces the Markdown string for a single element.
 *
 * Example: An <h3> element with text content of 'Title' yields '### Title'
 *
 * Elements whose tag is listed in the 'remove_nodes' option yield an empty string.
 * Every other element is handed to the converter the environment returns for its tag.
 *
 * @param ElementInterface $element
 *
 * @return string The converted HTML as Markdown
 */
protected function convertToMarkdown(ElementInterface $element)
{
    $tag = $element->getTagName();

    // Strip nodes named in remove_nodes (a space-separated list of tag names).
    $tags_to_remove = explode(' ', $this->getConfig()->getOption('remove_nodes'));

    // Strict comparison: tag names are only ever matched as exact strings.
    if (in_array($tag, $tags_to_remove, true)) {
        return '';
    }

    $converter = $this->environment->getConverterByTag($tag);

    return $converter->convert($element);
}
/**
 * Cleans up the serialised document so that only the Markdown text remains.
 *
 * Decodes HTML entities, removes the doctype declaration, and strips the wrapper
 * markup (XML encoding declaration, html/body/head tags, a leading carriage return)
 * from the start and end of the string.
 *
 * @param string $markdown
 *
 * @return string
 */
protected function sanitize($markdown)
{
    $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
    $markdown = preg_replace('/<!DOCTYPE [^>]+>/', '', $markdown); // Strip doctype declaration
    $markdown = trim($markdown); // Remove whitespace around the remaining markup

    /*
     * Removing unwanted tags. Tags should be added to the array in the order they are expected.
     * XML, html and body opening tags should be in that order. Same case with closing tags.
     * The carriage return is written as an escape sequence rather than a raw control character.
     */
    $unwanted = array('<?xml encoding="UTF-8">', '<html>', '</html>', '<body>', '</body>', '<head>', '</head>', "\r");

    foreach ($unwanted as $tag) {
        $tagLength = strlen($tag);

        if (strpos($tag, '/') === false) {
            // Opening tags (and the carriage return): strip only from the very start.
            if (strpos($markdown, $tag) === 0) {
                $markdown = (string) substr($markdown, $tagLength);
            }
        } elseif (strrpos($markdown, $tag) === strlen($markdown) - $tagLength) {
            // Closing tags: strip only from the very end. The LAST occurrence is looked up,
            // so an identical tag appearing earlier in the content does not prevent the
            // trailing one from being removed.
            $markdown = (string) substr($markdown, 0, -$tagLength);
        }
    }

    return trim($markdown, "\n\r\0\x0B");
}