tipue_search.py

# -*- coding: utf-8 -*-
"""
Tipue Search
============

A Pelican plugin to serialize generated HTML to JSON
that can be used by jQuery plugin - Tipue Search.

Copyright (c) Talha Mansoor
"""

from __future__ import unicode_literals

import os.path
import json

from bs4 import BeautifulSoup
from codecs import open

try:
    from urlparse import urljoin          # Python 2
except ImportError:
    from urllib.parse import urljoin      # Python 3

from pelican import signals


class Tipue_Search_JSON_Generator(object):

    def __init__(self, context, settings, path, theme, output_path, *null):
        self.output_path = output_path
        self.context = context
        # Pelican settings used to build URLs and locate template pages.
        self.siteurl = settings.get('SITEURL')
        self.relative_urls = settings.get('RELATIVE_URLS')
        self.tpages = settings.get('TEMPLATE_PAGES')
        self.json_nodes = []

    def create_json_node(self, page):
        # Only index published content; skip drafts and hidden pages.
        if getattr(page, 'status', 'published') != 'published':
            return

        # Strip markup from the title and body, normalise curly quotes, and
        # replace '^' with its HTML entity '&#94;'.
        soup_title = BeautifulSoup(page.title.replace('&nbsp;', ' '), 'html.parser')
        page_title = soup_title.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('^', '&#94;')

        soup_text = BeautifulSoup(page.content, 'html.parser')
        page_text = soup_text.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('¶', ' ').replace('^', '&#94;')
        page_text = ' '.join(page_text.split())

        # Articles carry a category; plain pages fall back to an empty tag.
        page_category = page.category.name if getattr(page, 'category', 'None') != 'None' else ''

        page_url = '.'
        if page.url:
            page_url = page.url if self.relative_urls else (self.siteurl + '/' + page.url)

        # Key names match Tipue Search's content format; 'url' also keeps this
        # consistent with create_tpage_node below.
        node = {'title': page_title,
                'text': page_text,
                'tags': page_category,
                'url': page_url}

        self.json_nodes.append(node)
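
    # create_tpage_node() below indexes TEMPLATE_PAGES, Pelican's mapping of
    # extra template sources to destination paths under the output directory.
    # A sketch of the setting this code assumes (file names are illustrative):
    #
    #     TEMPLATE_PAGES = {'templates/books.html': 'books.html'}
    #
    # The destination value is used both to re-open the rendered file from
    # output_path and, joined with SITEURL, to build the node's URL.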
    def create_tpage_node(self, srclink):
        # Template pages are not Pelican content objects, so read the
        # rendered HTML back from the output directory and strip its markup.
        with open(os.path.join(self.output_path, self.tpages[srclink]),
                  encoding='utf-8') as srcfile:
            soup = BeautifulSoup(srcfile, 'html.parser')

        page_title = soup.title.string if soup.title is not None else ''
        page_text = soup.get_text()

        # Should a default category be set?
        page_category = ''

        page_url = urljoin(self.siteurl, self.tpages[srclink])

        node = {'title': page_title,
                'text': page_text,
                'tags': page_category,
                'url': page_url}

        self.json_nodes.append(node)

    def generate_output(self, writer):
        # TODO: this should probably be reworked to use the following instead:
        # for p in self.context['PAGES']:
        #     print 'U', p.url

        path = os.path.join(self.output_path, 'tipuesearch_content.json')

        # Index pages, articles and every article translation.
        pages = self.context['pages'] + self.context['articles']
        for article in self.context['articles']:
            pages += article.translations

        for srclink in self.tpages:
            self.create_tpage_node(srclink)

        for page in pages:
            self.create_json_node(page)

        root_node = {'pages': self.json_nodes}

        with open(path, 'w', encoding='utf-8') as fd:
            json.dump(root_node, fd, separators=(',', ':'), ensure_ascii=False)


def get_generators(generators):
    return Tipue_Search_JSON_Generator


def register():
    signals.get_generators.connect(get_generators)
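
# For reference, generate_output() writes tipuesearch_content.json in this
# shape (values are illustrative, not taken from a real site):
#
#     {"pages": [{"title": "Hello world",
#                 "text": "Body text with markup stripped ...",
#                 "tags": "misc",
#                 "url": "https://example.com/hello-world.html"}]}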