// to-markdown.js 5.9 KB
  1. /*
  2. * to-markdown - an HTML to Markdown converter
  3. *
  4. * Copyright 2011, Dom Christie
  5. * Licenced under the MIT licence
  6. *
  7. */
  8. if (typeof he !== 'object' && typeof require === 'function') {
  9. var he = require('he');
  10. }
  /**
   * Converts an HTML string to Markdown using a fixed pipeline of regular
   * expression replacements:
   *
   *   1. simple elements listed in ELEMENTS (p, br, h1-h6, hr, a, b/strong,
   *      i/em, code, img), in that order;
   *   2. <pre> blocks whose content was wrapped in backticks by the `code` step;
   *   3. escaping of "<digits>. " at the start of a line so plain text is not
   *      read as an ordered list;
   *   4. lists, innermost first;
   *   5. blockquotes, innermost first;
   *   6. a final whitespace clean-up.
   *
   * HTML that none of the steps recognises is left in the output unchanged.
   * Entity decoding of code content is delegated to `he.decode`, so an `he`
   * object must be in scope when the input contains <code> elements.
   *
   * @param {string} string HTML to convert.
   * @returns {string} The Markdown text.
   * @throws {TypeError} If `string` is not a string.
   */
  var toMarkdown = function(string) {
    if (typeof string !== 'string') {
      throw new TypeError('toMarkdown expects an HTML string but received ' + typeof string);
    }

    // Markdown requires four spaces (or a tab) for indented code blocks and for
    // paragraphs / nested lists that belong to a list item.
    var INDENT = '    ';

    /**
     * Reads one attribute from the raw attribute text of a start tag.
     * Attributes are consumed left to right as whole name=value pairs, so a
     * name such as `data-src` is never mistaken for `src` and a quoted value
     * may contain the other quote character.
     *
     * @param {string} attrs Text between the tag name and the closing `>`.
     * @param {string} name Lower-case attribute name to look up.
     * @returns {?string} The value, or null when the attribute is absent or has no value.
     */
    function getAttr(attrs, name) {
      // 1: name, 2: "= value" part, 3: double-quoted, 4: single-quoted, 5: unquoted value
      var attrPattern = /([^\s"'<>\/=]+)(\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g;
      var match;
      while ((match = attrPattern.exec(attrs)) !== null) {
        if (match[2] !== undefined && match[1].toLowerCase() === name) {
          return match[3] || match[4] || match[5] || '';
        }
      }
      return null;
    }

    /** Returns ` "title"` for a non-empty title attribute, otherwise an empty string. */
    function titleSuffix(attrs) {
      var title = getAttr(attrs, 'title');
      return title ? ' "' + title + '"' : '';
    }

    /**
     * Replaces every occurrence of one element in `html`.
     *
     * @param {string} html Text to search.
     * @param {{tag: string, replacement: (string|Function), type: (string|undefined)}} elProperties
     *   `tag` is a regex source fragment for the tag name; `type: 'void'` marks
     *   elements without a closing tag. A function replacement receives the
     *   match followed by the first three further arguments supplied by
     *   String.prototype.replace (capture groups, then the match offset).
     * @returns {string} `html` with the element replaced.
     */
    function replaceEls(html, elProperties) {
      var tag = elProperties.tag;
      var replacement = elProperties.replacement;
      var pattern = elProperties.type === 'void'
        ? '<' + tag + '\\b([^>]*)\\/?>'
        : '<' + tag + '\\b([^>]*)>([\\s\\S]*?)<\\/' + tag + '>';
      var regex = new RegExp(pattern, 'gi');

      if (typeof replacement === 'string') {
        return html.replace(regex, replacement);
      }
      return html.replace(regex, function(str, p1, p2, p3) {
        return replacement(str, p1, p2, p3);
      });
    }

    /**
     * Converts one <ul>/<ol> that contains no further list elements.
     * Content after the last </li> is discarded.
     *
     * @param {string} html The list element, including its own tags.
     * @returns {string} The Markdown list, preceded by a blank line.
     */
    function replaceLists(html) {
      html = html.replace(/<(ul|ol)\b[^>]*>([\s\S]*?)<\/\1>/gi, function(str, listType, innerHTML) {
        // The tag regexes are case-insensitive, so the comparisons must be too.
        var isOrdered = listType.toLowerCase() === 'ol';
        var lis = innerHTML.split(/<\/li\s*>/i);
        lis.pop();

        for (var k = 0; k < lis.length; k++) {
          if (lis[k]) {
            var prefix = isOrdered ? (k + 1) + '. ' : '* ';
            lis[k] = lis[k].replace(/\s*<li\b[^>]*>([\s\S]*)/i, function(str, itemHTML) {
              // Trim both ends so trailing whitespace cannot glue two items together.
              itemHTML = itemHTML.replace(/^\s+|\s+$/g, '');
              // Indent continuation paragraphs. Lines starting with a list marker are
              // skipped here because the next step indents them (avoids double indent).
              itemHTML = itemHTML.replace(/\n\n(?![ ]*(?:\*|\d+\.) )/g, '\n\n' + INDENT);
              // Indent nested lists by one level. A single `[ ]*` is used on purpose:
              // a nested quantifier here backtracks exponentially on long runs of spaces.
              itemHTML = itemHTML.replace(/\n([ ]*)(\*|\d+\.) /g, '\n$1' + INDENT + '$2 ');
              return prefix + itemHTML;
            });
          }
        }
        return lis.join('\n');
      });
      // Strip trailing whitespace on each line (keeping the newline) and at the end.
      return '\n\n' + html.replace(/[ \t]+(?=\n)|\s+$/g, '');
    }

    /**
     * Converts one <blockquote> that contains no further blockquote elements
     * by prefixing every line of its trimmed content with "> ".
     *
     * @param {string} html The blockquote element, including its own tags.
     * @returns {string} The quoted Markdown text.
     */
    function replaceBlockquotes(html) {
      return html.replace(/<blockquote\b[^>]*>([\s\S]*?)<\/blockquote>/gi, function(str, inner) {
        inner = inner.replace(/^\s+|\s+$/g, '');
        inner = cleanUp(inner);
        inner = inner.replace(/^/gm, '> ');
        // Normalise markers of already-converted nested quotes that are separated
        // by two or more spaces/tabs.
        inner = inner.replace(/^(>([ \t]{2,}>)+)/gm, '> >');
        return inner;
      });
    }

    /**
     * Removes leading/trailing tabs and line breaks and collapses blank-line runs.
     *
     * @param {string} text Text to tidy.
     * @returns {string} The tidied text.
     */
    function cleanUp(text) {
      text = text.replace(/^[\t\r\n]+|[\t\r\n]+$/g, ''); // spaces are deliberately kept
      text = text.replace(/\n\s+\n/g, '\n\n');
      text = text.replace(/\n{3,}/g, '\n\n'); // limit consecutive linebreaks to 2
      return text;
    }

    // Element handlers, applied in array order. Non-void handlers receive
    // (match, attrs, innerHTML); the heading pattern adds a capture group, so its
    // handler receives (match, level, attrs, innerHTML).
    var ELEMENTS = [
      {
        patterns: 'p',
        replacement: function(str, attrs, innerHTML) {
          return innerHTML ? '\n\n' + innerHTML + '\n' : '';
        }
      },
      {
        patterns: 'br',
        type: 'void',
        replacement: '\n'
      },
      {
        patterns: 'h([1-6])',
        replacement: function(str, hLevel, attrs, innerHTML) {
          var hPrefix = new Array(parseInt(hLevel, 10) + 1).join('#');
          return '\n\n' + hPrefix + ' ' + innerHTML + '\n';
        }
      },
      {
        patterns: 'hr',
        type: 'void',
        replacement: '\n\n* * *\n'
      },
      {
        patterns: 'a',
        replacement: function(str, attrs, innerHTML) {
          var href = getAttr(attrs, 'href');
          // Anchors without an href are not links; keep them as HTML.
          if (href === null) {
            return str;
          }
          return '[' + innerHTML + '](' + href + titleSuffix(attrs) + ')';
        }
      },
      {
        patterns: ['b', 'strong'],
        replacement: function(str, attrs, innerHTML) {
          return innerHTML ? '**' + innerHTML + '**' : '';
        }
      },
      {
        patterns: ['i', 'em'],
        replacement: function(str, attrs, innerHTML) {
          return innerHTML ? '_' + innerHTML + '_' : '';
        }
      },
      {
        patterns: 'code',
        replacement: function(str, attrs, innerHTML) {
          return innerHTML ? '`' + he.decode(innerHTML) + '`' : '';
        }
      },
      {
        patterns: 'img',
        type: 'void',
        replacement: function(str, attrs) {
          var src = getAttr(attrs, 'src');
          // An image without a source has no Markdown form; drop it instead of throwing.
          if (src === null) {
            return '';
          }
          return '![' + (getAttr(attrs, 'alt') || '') + '](' + src + titleSuffix(attrs) + ')';
        }
      }
    ];

    for (var i = 0; i < ELEMENTS.length; i++) {
      var element = ELEMENTS[i];
      var tags = typeof element.patterns === 'string' ? [element.patterns] : element.patterns;
      for (var j = 0; j < tags.length; j++) {
        string = replaceEls(string, { tag: tags[j], replacement: element.replacement, type: element.type });
      }
    }

    // Pre code blocks. The backticks in the pattern were produced by the `code`
    // handler above, so only <pre><code>…</code></pre> is converted.
    // NOTE(review): the content was already passed through he.decode by the
    // `code` handler; this second decode looks like double decoding - confirm.
    string = string.replace(/<pre\b[^>]*>`([\s\S]*?)`<\/pre>/gi, function(str, innerHTML) {
      var text = he.decode(innerHTML);
      text = text.replace(/^\t+/g, ' '); // a run of tabs at the very start becomes one space
      text = text.replace(/\n/g, '\n' + INDENT);
      return '\n\n' + INDENT + text + '\n';
    });

    // Escape numbers that could trigger an ol. Lines indented by four or more
    // spaces are code and are left alone. The `m` flag makes `^` match at every
    // line start; the period is escaped so it is matched literally.
    string = string.replace(/^([ \t]{0,3}\d+)\. /gm, '$1\\. ');

    // Lists: convert lists that contain no other list first, then work outwards.
    var noChildrenRegex = /<(ul|ol)\b[^>]*>(?:(?!<ul|<ol)[\s\S])*?<\/\1>/gi;
    while (string.match(noChildrenRegex)) {
      string = string.replace(noChildrenRegex, function(str) {
        return replaceLists(str);
      });
    }

    // Blockquotes: same innermost-first strategy.
    var deepest = /<blockquote\b[^>]*>((?:(?!<blockquote)[\s\S])*?)<\/blockquote>/gi;
    while (string.match(deepest)) {
      string = string.replace(deepest, function(str) {
        return replaceBlockquotes(str);
      });
    }

    return cleanUp(string);
  };
  // CommonJS support: when an `exports` object is available (e.g. under Node's
  // require), publish the converter on it; otherwise this statement does nothing.
  if ('object' === typeof exports) {
    exports['toMarkdown'] = toMarkdown;
  }