proxy.py/examples/web_scraper.py

# -*- coding: utf-8 -*-
"""
proxy.py
~~~~~~~~
⚡⚡⚡ Fast, Lightweight, Pluggable, TLS interception capable proxy server focused on
Network monitoring, controls & Application development, testing, debugging.
:copyright: (c) 2013-present by Abhinav Singh and contributors.
:license: BSD, see LICENSE for more details.
"""
import time
import socket
from typing import Dict

from proxy.common.flag import FlagParser
from proxy.core.acceptor import Work, AcceptorPool
from proxy.common.types import Readables, Writables


class WebScraper(Work):
    """Demonstrates how to orchestrate a generic work acceptor and executor
    workflow using the proxy.py core.

    By default, `WebScraper` expects to receive work from a file on disk.
    Each line in the file must be a URL to scrape. A received URL is scraped
    by the implementation in this class.

    After scraping, results are published to the eventing core. One or more
    result subscribers can then handle the result as necessary. Currently,
    result subscribers consume the scraped response and write discovered URLs
    into the file on disk. This creates a feedback loop, allowing WebScraper
    to continue endlessly.

    NOTE: No loop detection is performed currently.

    NOTE: The file descriptor need not point to a file on disk.
    For example, it can be a database connection. For simplicity,
    imagine a Redis server connection handling only the PUBSUB protocol.
    """

    def get_events(self) -> Dict[socket.socket, int]:
        """Return sockets and events (read or write) that we are interested in."""
        return {}
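
    # Illustrative sketch (not part of the original example): a concrete
    # scraper would register the sockets it wants the core event loop to
    # watch. Assuming ``self.work`` wraps the accepted client connection and
    # exposes the underlying socket via ``.connection`` (verify this against
    # your proxy.py version), read interest could be expressed as:
    #
    #   import selectors
    #   return {self.work.connection: selectors.EVENT_READ}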

    def handle_events(
            self,
            readables: Readables,
            writables: Writables,
    ) -> bool:
        """Handle readable and writable sockets.

        Return True to shut down this work."""
        return False
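
    # Illustrative sketch (not part of the original example): handle_events
    # is where the actual scraping would go. Draining readable sockets and
    # returning False keeps this work registered; returning True asks the
    # core to tear it down. ``parse_and_publish`` below is a hypothetical
    # helper, not a proxy.py API:
    #
    #   for sock in readables:
    #       data = sock.recv(4096)
    #       if data == b'':
    #           return True          # peer closed, shut down this work
    #       self.parse_and_publish(data)
    #   return False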


if __name__ == '__main__':
    with AcceptorPool(
        flags=FlagParser.initialize(
            port=12345,
            num_workers=1,
            threadless=True,
            keyfile='https-key.pem',
            certfile='https-signed-cert.pem',
        ),
        work_klass=WebScraper,
    ) as pool:
        while True:
            time.sleep(1)
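
# Illustrative usage note (not part of the original example): running this
# module starts an AcceptorPool with a single threadless worker listening on
# port 12345. The ``keyfile`` and ``certfile`` flags reference
# 'https-key.pem' and 'https-signed-cert.pem', which must exist on disk
# before start-up. Something along the lines of:
#
#   python examples/web_scraper.py
#
# keeps running until the ``while True: time.sleep(1)`` loop is interrupted
# (for example with Ctrl+C), at which point the pool shuts down via the
# context manager.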