#
# Example hooks file for urlwatch
#
# Copyright (c) 2008-2011 Thomas Perl <thp.io/about>
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. The name of the author may not be used to endorse or promote products
#    derived from this software without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#


# You can decide which filter you want to apply using the "url"
# parameter and you can use the "re" module to search for the
# content that you want to filter, so the noise is removed.


# Needed for regular expression substitutions
import re

# Additional modules installed with urlwatch
from urlwatch import ical2txt
from urlwatch import html2txt


def filter(url, data):
    if url == 'http://www.inso.tuwien.ac.at/lectures/usability/':
        return re.sub('.*TYPO3SEARCH_end.*', '', data)
    elif url == 'https://www.auto.tuwien.ac.at/courses/viewDetails/11/':
        return re.sub('</html><!-- \d+ -->', '', data)
    elif url == 'http://grenzlandvagab.gr.funpic.de/events/':
        return re.sub('<!-- Ad by .*by funpic.de -->', '', data)
    elif url == 'http://www.mv-eberau.at/terminliste.php':
        return data.replace('</br>', '\n')
    elif 'iuner.lukas-krispel.at' in url:
        # Remove always-changing entries from FTP server listing
        return re.sub('drwx.*usage', '', re.sub('drwx.*logs', '', data))
    elif url.startswith('http://ti.tuwien.ac.at/rts/teaching/courses/'):
        # example of using the "tidy" module for cleaning up bad HTML
        import tidy
        mlr = re.compile('magicCalendarHeader.*magicCalendarBottom', re.S)
        data = str(tidy.parseString(data, output_xhtml=1, indent=0, tidy_mark=0))
        return re.sub(mlr, '', data)
    elif url == 'http://www.poleros.at/calender.htm':
        # remove style changes, because we only want to see content changes
        return re.sub('style="[^"]"', '', data)
    elif url == 'http://www.ads.tuwien.ac.at/teaching/LVA/186170.html':
        return re.sub('Saved in parser cache with key .* and timestamp .* --', '', re.sub('Served by aragon in .* secs\.', '', re.sub('This page has been accessed .* times\.', '', data)))
    elif url.endswith('.ics') or url == 'http://www.kukuk.at/ical/events':
        # example of generating a summary for icalendar files
        # append "data" to the converted ical data, so you get
        # all minor changes to the ICS that are not included
        # in the ical2text summary (remove this if you want)
        return ical2txt.ical2text(data).encode('utf-8') + '\n\n' + data
    elif url == 'http://www.oho.at/programm/programm.php3':
        # example of converting HTML to plaintext for very
        # ugly HTML code that cannot be deciphered when just
        # diffing the HTML source (or if the user is just not
        # used to HTML, use this for every web page)
        #
        # You need to install "lynx" for this to work or use
        # "html2text" as method (needs "html2text") or use
        # "re" (does not need anything, but only strips tags
        # using a regular expression and does no formatting)
        return html2txt.html2text(data, method='lynx')

    # The next line is optional - if the filter function returns
    # None (or no value at all), the input data will be taken as
    # the result -> None as return value means "don't filter".
    return data

