proxy.py/examples/web_scraper.py

# -*- coding: utf-8 -*-
"""
proxy.py
~~~~~~~~
⚡⚡⚡ Fast, Lightweight, Pluggable, TLS interception capable proxy server focused on
Network monitoring, controls & Application development, testing, debugging.
:copyright: (c) 2013-present by Abhinav Singh and contributors.
:license: BSD, see LICENSE for more details.
"""
import time
import socket
from typing import Dict

from proxy.common.flag import FlagParser
from proxy.core.acceptor import Work, AcceptorPool
from proxy.common.types import Readables, Writables


class WebScraper(Work):
    """Demonstrates how to orchestrate a generic work acceptor and executor
    workflow using the proxy.py core.

    By default, `WebScraper` expects to receive work from a file on disk.
    Each line in the file must be a URL to scrape. A received URL is scraped
    by the implementation in this class.

    After scraping, results are published to the eventing core. One or more
    result subscribers can then handle the result as necessary. Currently,
    result subscribers consume the scraped response and write discovered URLs
    into the file on disk. This creates a feedback loop, allowing WebScraper
    to continue endlessly.

    NOTE: No loop detection is performed currently.

    NOTE: The file descriptor need not point to a file on disk.
    For example, it can be a database connection. For simplicity,
    imagine a Redis server connection handling only the PUBSUB protocol.
    """

    def get_events(self) -> Dict[socket.socket, int]:
        """Return sockets and events (read or write) that we are interested in."""
        return {}
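
    # Illustrative sketch (not part of the original example): a concrete
    # scraper would register the sockets it wants the core event loop to
    # watch. Assuming ``self.work`` wraps the accepted client connection and
    # exposes the underlying socket via ``.connection`` (verify this against
    # your proxy.py version), read interest could be expressed as:
    #
    #   import selectors
    #   return {self.work.connection: selectors.EVENT_READ}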

    def handle_events(
            self,
            readables: Readables,
            writables: Writables,
    ) -> bool:
        """Handle readable and writable sockets.

        Return True to shut down this work."""
        return False
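
    # Illustrative sketch (not part of the original example): handle_events
    # is where the actual scraping would go. Draining readable sockets and
    # returning False keeps this work registered; returning True asks the
    # core to tear it down. ``parse_and_publish`` below is a hypothetical
    # helper, not a proxy.py API:
    #
    #   for sock in readables:
    #       data = sock.recv(4096)
    #       if data == b'':
    #           return True          # peer closed, shut down this work
    #       self.parse_and_publish(data)
    #   return False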


if __name__ == '__main__':
    with AcceptorPool(
        flags=FlagParser.initialize(
            port=12345,
            num_workers=1,
            threadless=True,
            keyfile='https-key.pem',
            certfile='https-signed-cert.pem',
        ),
        work_klass=WebScraper,
    ) as pool:
        while True:
            time.sleep(1)
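
# Illustrative usage note (not part of the original example): running this
# module starts an AcceptorPool with a single threadless worker listening on
# port 12345. The ``keyfile`` and ``certfile`` flags reference
# 'https-key.pem' and 'https-signed-cert.pem', which must exist on disk
# before start-up. Something along the lines of:
#
#   python examples/web_scraper.py
#
# keeps running until the ``while True: time.sleep(1)`` loop is interrupted
# (for example with Ctrl+C), at which point the pool shuts down via the
# context manager.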