#!/usr/bin/env python3
"""Archive a small website: crawl pages, save assets, record site structure.

Crawls the site rooted at ``base_url`` (honoring robots.txt), writes each
HTML page and its same-origin images/scripts/stylesheets under
``docs/archive/``, and dumps a machine-readable site map to
``docs/archive/structure.json``.
"""

import html.parser
import json
import mimetypes
import os
import sys
import urllib.error
import urllib.parse
import urllib.robotparser
from collections import deque


class LinkExtractor(html.parser.HTMLParser):
    """Collect links, asset references, title, meta tags and visible text."""

    def __init__(self):
        super().__init__()
        self.links = []        # href values of <a> tags
        self.images = []       # src values of <img> tags
        self.scripts = []      # src values of <script> tags
        self.styles = []       # href of <link rel="stylesheet"> / rel="icon"
        self.text_parts = []   # non-empty text chunks outside <title>
        self.title = ''
        self.meta = {}         # <meta> name/property -> content
        self.in_title = False

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.links.extend(v for k, v in attrs if k == 'href')
        elif tag == 'img':
            self.images.extend(v for k, v in attrs if k == 'src')
        elif tag == 'script':
            self.scripts.extend(v for k, v in attrs if k == 'src')
        elif tag == 'link':
            d = dict(attrs)  # duplicate attributes: last one wins
            if d.get('rel') in ('stylesheet', 'icon') and d.get('href'):
                self.styles.append(d['href'])
        elif tag == 'title':
            self.in_title = True
        elif tag == 'meta':
            d = dict(attrs)
            name = d.get('name') or d.get('property')
            content = d.get('content')
            if name and content:
                self.meta[name] = content

    def handle_endtag(self, tag):
        if tag == 'title':
            self.in_title = False

    def handle_data(self, data):
        if self.in_title:
            self.title += data
        else:
            # Skip whitespace-only runs; the original appended '' entries,
            # which padded the eventual ' '.join() with extra spaces.
            text = data.strip()
            if text:
                self.text_parts.append(text)


def download_asset(url, base_path, timeout=10):
    """Fetch *url* and store it under *base_path*.

    A file extension is inferred from the response Content-Type when the
    URL path does not provide one.

    Returns the local file path, or None when the request fails or does
    not return HTTP 200.
    """
    # Third-party dependency imported lazily so the module itself (and
    # LinkExtractor) can be imported without requests installed.
    import requests

    try:
        resp = requests.get(url, timeout=timeout)
        if resp.status_code != 200:
            return None
        content_type = resp.headers.get('content-type', '')
        # guess_extension() returns None for parameterized values such as
        # "text/css; charset=utf-8", so strip parameters before guessing.
        mime = content_type.split(';')[0].strip()
        ext = mimetypes.guess_extension(mime) or '.bin'
        filename = os.path.basename(urllib.parse.urlparse(url).path)
        if not filename:
            filename = 'asset' + ext
        elif not os.path.splitext(filename)[1]:
            filename += ext
        filepath = os.path.join(base_path, filename)
        os.makedirs(base_path, exist_ok=True)
        with open(filepath, 'wb') as f:
            f.write(resp.content)
        return filepath
    except Exception as e:
        # Best-effort: a failed asset must not abort the crawl.
        print(f"Error downloading {url}: {e}")
        return None


def main():
    base_url = 'https://nixtamal.toast.al'
    import requests  # see note in download_asset

    # Check robots.txt before crawling anything.
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(base_url + '/robots.txt')
    try:
        rp.read()
    except (urllib.error.URLError, OSError):
        # BUG FIX: the original used a bare `except:` wrapped around the
        # can_fetch() check below, which also caught the SystemExit raised
        # by sys.exit(1) — so a robots.txt disallow was silently ignored
        # and the crawl proceeded anyway. The narrow except here only
        # covers the network read.
        print("Could not read robots.txt, proceeding assuming allowed")
    else:
        if not rp.can_fetch('*', base_url + '/'):
            print("Crawling not allowed by robots.txt")
            sys.exit(1)

    # makedirs creates intermediate directories, so this also creates
    # docs/archive.
    os.makedirs('docs/archive/assets', exist_ok=True)

    visited = set()
    seen = {base_url}        # every URL ever enqueued: O(1) dedup instead of
                             # the original O(n) `not in queue` deque scan
    queue = deque([base_url])
    pages_data = {}

    while queue:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        print(f"Crawling: {url}")
        try:
            resp = requests.get(url, timeout=10)
            if resp.status_code != 200:
                print(f"Skipping {url} with status {resp.status_code}")
                continue
            content = resp.text
            parser = LinkExtractor()
            parser.feed(content)

            # Resolve links and enqueue same-site ones. Fragments are
            # stripped so page#a / page#b are not crawled as distinct pages.
            abs_links = []
            for link in parser.links:
                abs_link = urllib.parse.urldefrag(
                    urllib.parse.urljoin(url, link)).url
                if abs_link.startswith(base_url):
                    abs_links.append(abs_link)
                    if abs_link not in seen:
                        seen.add(abs_link)
                        queue.append(abs_link)

            # Download same-site assets of each kind.
            assets = []
            for kind, refs in (('image', parser.images),
                               ('script', parser.scripts),
                               ('style', parser.styles)):
                for ref in refs:
                    asset_url = urllib.parse.urljoin(url, ref)
                    if asset_url.startswith(base_url):
                        path = download_asset(asset_url, 'docs/archive/assets')
                        if path:
                            assets.append({'type': kind,
                                           'url': asset_url,
                                           'local_path': path})

            # Save the raw page HTML under a flattened filename.
            path = urllib.parse.urlparse(url).path
            if not path or path == '/':
                filename = 'index.html'
            else:
                filename = path.strip('/').replace('/', '_') + '.html'
            filepath = os.path.join('docs/archive', filename)
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)

            pages_data[url] = {
                'title': parser.title,
                'meta': parser.meta,
                'text': ' '.join(parser.text_parts),
                'links': abs_links,
                'assets': assets,
                'local_file': filepath,
            }
        except Exception as e:
            # Boundary handler: a single bad page must not stop the crawl.
            print(f"Error crawling {url}: {e}")

    with open('docs/archive/structure.json', 'w', encoding='utf-8') as f:
        json.dump(pages_data, f, indent=2, ensure_ascii=False)
    print("Crawling complete. Data saved to docs/archive/")


if __name__ == '__main__':
    main()