diff options
Diffstat (limited to 'scripts/spider.py')
| -rw-r--r-- | scripts/spider.py | 192 |
1 file changed, 192 insertions(+), 0 deletions(-)
#!/usr/bin/env python3
"""Mirror a small website into docs/archive/.

Breadth-first crawl starting at BASE_URL (honouring robots.txt when it
can be read), saving each page's raw HTML under docs/archive/,
downloading same-site images, scripts and stylesheets into
docs/archive/assets/, and writing a structure.json index with each
page's title, meta tags, text, outgoing links and downloaded assets.
"""

import hashlib
import html.parser
import json
import mimetypes
import os
import sys
import urllib.parse
import urllib.robotparser
from collections import deque

import requests

BASE_URL = 'https://nixtamal.toast.al'
ARCHIVE_DIR = 'docs/archive'
ASSETS_DIR = os.path.join(ARCHIVE_DIR, 'assets')
TIMEOUT = 10  # seconds, per HTTP request


class LinkExtractor(html.parser.HTMLParser):
    """Collect links, asset URLs, title, meta tags and body text from HTML."""

    def __init__(self):
        super().__init__()
        self.links = []        # href values of <a> tags
        self.images = []       # src values of <img> tags
        self.scripts = []      # src values of <script> tags
        self.styles = []       # href values of <link rel="stylesheet"/"icon">
        self.text_parts = []   # stripped text nodes outside <title>
        self.title = ''
        self.meta = {}         # <meta> name/property -> content
        self.in_title = False

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == 'a':
            if attrs.get('href'):
                self.links.append(attrs['href'])
        elif tag == 'img':
            if attrs.get('src'):
                self.images.append(attrs['src'])
        elif tag == 'script':
            if attrs.get('src'):
                self.scripts.append(attrs['src'])
        elif tag == 'link':
            # rel is a space-separated token list (e.g. "shortcut icon"),
            # so match tokens rather than the exact attribute string.
            rel_tokens = set((attrs.get('rel') or '').split())
            href = attrs.get('href')
            if href and rel_tokens & {'stylesheet', 'icon'}:
                self.styles.append(href)
        elif tag == 'title':
            self.in_title = True
        elif tag == 'meta':
            name = attrs.get('name') or attrs.get('property')
            content = attrs.get('content')
            if name and content:
                self.meta[name] = content

    def handle_endtag(self, tag):
        if tag == 'title':
            self.in_title = False

    def handle_data(self, data):
        if self.in_title:
            self.title += data
        else:
            self.text_parts.append(data.strip())


def download_asset(url, base_path, timeout=TIMEOUT):
    """Download one asset into base_path; return the local path, or None.

    Filenames are prefixed with a short hash of the URL so that assets
    from different directories that share a basename (e.g. several
    logo.png files) do not overwrite each other.
    """
    try:
        resp = requests.get(url, timeout=timeout)
        if resp.status_code != 200:
            return None
        # Strip "; charset=..." parameters: guess_extension() only
        # understands a bare media type and would otherwise return None.
        media_type = resp.headers.get('content-type', '').split(';')[0].strip()
        ext = mimetypes.guess_extension(media_type) or '.bin'
        basename = os.path.basename(urllib.parse.urlparse(url).path)
        if not basename:
            basename = 'asset' + ext
        elif not os.path.splitext(basename)[1]:
            basename += ext
        digest = hashlib.sha1(url.encode('utf-8')).hexdigest()[:8]
        filepath = os.path.join(base_path, f'{digest}_{basename}')
        os.makedirs(base_path, exist_ok=True)
        with open(filepath, 'wb') as f:
            f.write(resp.content)
        return filepath
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading {url}: {e}")
        return None


def _load_robots(base_url):
    """Fetch robots.txt for base_url; return the parser, or None if unreadable."""
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(base_url + '/robots.txt')
    try:
        rp.read()
    except Exception as e:  # network/parse failure: proceed, assume allowed
        print(f"Could not read robots.txt ({e}), proceeding assuming allowed")
        return None
    return rp


def _local_filename(url):
    """Map a page URL to an archive filename ('/' -> index.html)."""
    path = urllib.parse.urlparse(url).path
    if not path or path == '/':
        return 'index.html'
    name = path.strip('/').replace('/', '_')
    # Avoid doubling the extension for paths that already end in .html.
    if not name.endswith('.html'):
        name += '.html'
    return name


def main():
    rp = _load_robots(BASE_URL)
    if rp is not None and not rp.can_fetch('*', BASE_URL + '/'):
        print("Crawling not allowed by robots.txt")
        sys.exit(1)

    os.makedirs(ASSETS_DIR, exist_ok=True)  # also creates ARCHIVE_DIR

    visited = set()
    enqueued = {BASE_URL}   # mirror of queue contents for O(1) membership tests
    queue = deque([BASE_URL])
    asset_cache = {}        # asset URL -> local path (or None); download once
    pages_data = {}

    while queue:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        # Re-check robots.txt per page, not just for the site root.
        if rp is not None and not rp.can_fetch('*', url):
            print(f"Skipping {url}: disallowed by robots.txt")
            continue
        print(f"Crawling: {url}")

        try:
            resp = requests.get(url, timeout=TIMEOUT)
            if resp.status_code != 200:
                print(f"Skipping {url} with status {resp.status_code}")
                continue

            content = resp.text
            parser = LinkExtractor()
            parser.feed(content)

            # Resolve and filter same-site links; drop #fragments so that
            # "page" and "page#section" are not crawled twice.
            abs_links = []
            for link in parser.links:
                abs_link = urllib.parse.urldefrag(
                    urllib.parse.urljoin(url, link))[0]
                if not abs_link.startswith(BASE_URL):
                    continue
                abs_links.append(abs_link)
                if abs_link not in visited and abs_link not in enqueued:
                    enqueued.add(abs_link)
                    queue.append(abs_link)

            # Download same-site assets, each URL at most once per crawl.
            assets = []
            for kind, refs in (('image', parser.images),
                               ('script', parser.scripts),
                               ('style', parser.styles)):
                for ref in refs:
                    asset_url = urllib.parse.urljoin(url, ref)
                    if not asset_url.startswith(BASE_URL):
                        continue
                    if asset_url not in asset_cache:
                        asset_cache[asset_url] = download_asset(
                            asset_url, ASSETS_DIR)
                    local = asset_cache[asset_url]
                    if local:
                        assets.append({'type': kind, 'url': asset_url,
                                       'local_path': local})

            # Save the raw page HTML.
            filepath = os.path.join(ARCHIVE_DIR, _local_filename(url))
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)

            pages_data[url] = {
                'title': parser.title,
                'meta': parser.meta,
                'text': ' '.join(parser.text_parts),
                'links': abs_links,
                'assets': assets,
                'local_file': filepath,
            }
        except Exception as e:
            # Keep crawling the remaining pages even if one fails.
            print(f"Error crawling {url}: {e}")

    # Save the crawl index.
    with open(os.path.join(ARCHIVE_DIR, 'structure.json'), 'w',
              encoding='utf-8') as f:
        json.dump(pages_data, f, indent=2, ensure_ascii=False)

    print("Crawling complete. Data saved to docs/archive/")


if __name__ == '__main__':
    main()
