summaryrefslogtreecommitdiff
path: root/scripts/spider.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/spider.py')
-rw-r--r--scripts/spider.py192
1 file changed, 192 insertions, 0 deletions
diff --git a/scripts/spider.py b/scripts/spider.py
new file mode 100644
index 0000000..c73a1ff
--- /dev/null
+++ b/scripts/spider.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import urllib.parse
+import urllib.robotparser
+import html.parser
+import json
+import requests
+import mimetypes
+from collections import deque
+
class LinkExtractor(html.parser.HTMLParser):
    """Extract links, asset references, title, meta tags and visible text
    from an HTML page.

    After ``feed()``-ing a document, results are exposed as attributes:
    ``links`` (<a href>), ``images`` (<img src>), ``scripts`` (<script src>),
    ``styles`` (stylesheet/icon <link href>), ``title``, ``meta``
    (name/property -> content) and ``text_parts`` (non-empty text fragments).
    """

    def __init__(self):
        super().__init__()
        self.links = []       # href values of <a> tags
        self.images = []      # src values of <img> tags
        self.scripts = []     # src values of <script> tags
        self.styles = []      # href values of stylesheet/icon <link> tags
        self.text_parts = []  # non-empty text fragments outside <title>
        self.title = ''
        self.meta = {}        # meta name/property -> content
        self.in_title = False

    def handle_starttag(self, tag, attrs):
        # html.parser lowercases tag and attribute names; valueless
        # attributes arrive as (name, None), hence the truthiness checks
        # below so we never collect None or empty URLs.
        attrd = dict(attrs)
        if tag == 'a':
            href = attrd.get('href')
            if href:
                self.links.append(href)
        elif tag == 'img':
            src = attrd.get('src')
            if src:
                self.images.append(src)
        elif tag == 'script':
            src = attrd.get('src')
            if src:
                self.scripts.append(src)
        elif tag == 'link':
            rel = attrd.get('rel')
            href = attrd.get('href')
            # rel is a space-separated, case-insensitive token list
            # (e.g. "shortcut icon"), so match tokens rather than the
            # whole string.
            rel_tokens = rel.lower().split() if rel else []
            if href and ('stylesheet' in rel_tokens or 'icon' in rel_tokens):
                self.styles.append(href)
        elif tag == 'title':
            self.in_title = True
        elif tag == 'meta':
            name = attrd.get('name') or attrd.get('property')
            content = attrd.get('content')
            if name and content:
                self.meta[name] = content

    def handle_endtag(self, tag):
        if tag == 'title':
            self.in_title = False

    def handle_data(self, data):
        if self.in_title:
            self.title += data
        else:
            # Keep only non-empty fragments so ' '.join(text_parts)
            # does not produce runs of extra whitespace.
            text = data.strip()
            if text:
                self.text_parts.append(text)
+
def download_asset(url, base_path, timeout=10):
    """Download *url* and save it under *base_path*.

    The filename is taken from the URL path's basename; when the URL has
    no basename (or no extension) one is derived from the response's
    Content-Type header via mimetypes.

    Returns the local file path on success, or None on a non-200 status
    or any network/filesystem error (best-effort: errors are printed,
    not raised).
    """
    try:
        resp = requests.get(url, timeout=timeout)
        if resp.status_code != 200:
            return None
        # Content-Type may carry parameters ("text/css; charset=utf-8");
        # mimetypes.guess_extension only understands the bare media type.
        content_type = resp.headers.get('content-type', '').split(';')[0].strip()
        ext = mimetypes.guess_extension(content_type) or '.bin'
        filename = os.path.basename(urllib.parse.urlparse(url).path)
        if not filename:
            filename = 'asset' + ext
        elif not os.path.splitext(filename)[1]:
            filename += ext
        filepath = os.path.join(base_path, filename)
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, 'wb') as f:
            f.write(resp.content)
        return filepath
    except (requests.RequestException, OSError) as e:
        # Narrow catch: network and filesystem failures only, so
        # programming errors are not silently swallowed.
        print(f"Error downloading {url}: {e}")
        return None
+
def main():
    """Crawl the site rooted at ``base_url`` breadth-first and archive
    every same-site page, its assets, and a JSON site map under
    docs/archive/.
    """
    base_url = 'https://nixtamal.toast.al'

    # Honour robots.txt before crawling anything.
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(base_url + '/robots.txt')
    try:
        rp.read()
        if not rp.can_fetch('*', base_url + '/'):
            print("Crawling not allowed by robots.txt")
            sys.exit(1)
    except Exception:
        # NOTE: must not be a bare "except:" — that would also catch the
        # SystemExit raised by sys.exit(1) above and crawl a site whose
        # robots.txt explicitly disallows it.
        print("Could not read robots.txt, proceeding assuming allowed")

    # Create output directories (parents created implicitly).
    os.makedirs('docs/archive/assets', exist_ok=True)

    # `seen` covers both visited and queued URLs: an O(1) membership
    # test instead of the O(n) "not in queue" scan over the deque.
    seen = {base_url}
    queue = deque([base_url])
    pages_data = {}

    while queue:
        url = queue.popleft()
        print(f"Crawling: {url}")

        try:
            resp = requests.get(url, timeout=10)
            if resp.status_code != 200:
                print(f"Skipping {url} with status {resp.status_code}")
                continue

            content = resp.text
            parser = LinkExtractor()
            parser.feed(content)

            # Resolve links against the page URL and keep same-site ones.
            abs_links = []
            for link in parser.links:
                # Drop #fragments so in-page anchors don't look like
                # distinct pages and get crawled twice.
                abs_link = urllib.parse.urldefrag(
                    urllib.parse.urljoin(url, link))[0]
                if abs_link.startswith(base_url):
                    abs_links.append(abs_link)
                    if abs_link not in seen:
                        seen.add(abs_link)
                        queue.append(abs_link)

            # Download same-site assets of each kind.
            assets = []
            for kind, refs in (('image', parser.images),
                               ('script', parser.scripts),
                               ('style', parser.styles)):
                for ref in refs:
                    asset_url = urllib.parse.urljoin(url, ref)
                    if asset_url.startswith(base_url):
                        local = download_asset(asset_url, 'docs/archive/assets')
                        if local:
                            assets.append({'type': kind,
                                           'url': asset_url,
                                           'local_path': local})

            # Save the raw page HTML; flatten the URL path into a
            # single filename ("a/b" -> "a_b.html").
            path = urllib.parse.urlparse(url).path
            if not path or path == '/':
                filename = 'index.html'
            else:
                filename = path.strip('/').replace('/', '_') + '.html'
            filepath = os.path.join('docs/archive', filename)
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)

            # Record structured data for the site map.
            pages_data[url] = {
                'title': parser.title,
                'meta': parser.meta,
                'text': ' '.join(parser.text_parts),
                'links': abs_links,
                'assets': assets,
                'local_file': filepath
            }

        except Exception as e:
            # Best-effort crawl: log the failed page and move on.
            print(f"Error crawling {url}: {e}")

    # Save the site structure as JSON.
    with open('docs/archive/structure.json', 'w', encoding='utf-8') as f:
        json.dump(pages_data, f, indent=2, ensure_ascii=False)

    print("Crawling complete. Data saved to docs/archive/")
+
# Script entry point: run the crawl only when executed directly,
# not when imported.
if __name__ == '__main__':
    main()