started ex_4
This commit is contained in:
9
.vscode/launch.json
vendored
9
.vscode/launch.json
vendored
@@ -53,6 +53,15 @@
|
||||
"preLaunchTask": "build-task5",
|
||||
"console": "integratedTerminal"
|
||||
},
|
||||
{
|
||||
"type": "java",
|
||||
"name": "Run Crawler (Week 4)",
|
||||
"request": "launch",
|
||||
"cwd": "${workspaceFolder}/week4_TinsaeGhilay/crawler",
|
||||
"projectName": "crawler",
|
||||
"mainClass": "com.tinsae.crawler.Main",
|
||||
"console": "integratedTerminal"
|
||||
},
|
||||
{
|
||||
"type": "java",
|
||||
"name": "Run HelloWorldServer (Task 7/example)",
|
||||
|
||||
BIN
week3_TinsaeGhilay.zip
Normal file
BIN
week3_TinsaeGhilay.zip
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
38
week4_TinsaeGhilay/crawler/pom.xml
Normal file
38
week4_TinsaeGhilay/crawler/pom.xml
Normal file
@@ -0,0 +1,38 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.tinsae.crawler</groupId>
    <artifactId>crawler</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <!-- Pin the source encoding so builds are reproducible across platforms;
             without this Maven warns and falls back to the platform default. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>17</maven.compiler.source>
        <maven.compiler.target>17</maven.compiler.target>
    </properties>

    <dependencies>
        <!-- JSoup for HTML parsing -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.15.3</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Lets `mvn exec:java` run the crawler entry point directly. -->
            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>exec-maven-plugin</artifactId>
                <version>3.1.0</version>
                <configuration>
                    <mainClass>com.tinsae.crawler.Main</mainClass>
                </configuration>
            </plugin>
        </plugins>
    </build>

</project>
|
||||
@@ -0,0 +1,160 @@
|
||||
package com.tinsae.crawler;
|
||||
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* CrawlTask represents a single URL crawl operation
|
||||
* Runs in a separate thread, fetches content, extracts links,
|
||||
* appends URLs to file, and submits new tasks for discovered links.
|
||||
*/
|
||||
public class CrawlTask implements Runnable {
|
||||
|
||||
private final String url;
|
||||
private final int depth;
|
||||
private final Crawler crawler;
|
||||
private static final String urlsFilename = "crawled_urls_" + System.currentTimeMillis() + ".txt";
|
||||
|
||||
public CrawlTask(String url, int depth, Crawler crawler) {
|
||||
this.url = url;
|
||||
this.depth = depth;
|
||||
this.crawler = crawler;
|
||||
}
|
||||
|
||||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
public int getDepth() {
|
||||
return depth;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
System.out.println("[" + Thread.currentThread().getName() + "] Crawling: " + url + " (depth: " + depth + ")");
|
||||
|
||||
try {
|
||||
// Fetch and parse the page
|
||||
Document document = fetchPage(url);
|
||||
if (document == null) {
|
||||
System.err.println("[" + Thread.currentThread().getName() + "] Failed to fetch: " + url);
|
||||
return;
|
||||
}
|
||||
|
||||
// Extract links if we haven't reached max depth
|
||||
if (depth < crawler.getMaxDepth()) {
|
||||
Set<String> links = extractLinks(document, url);
|
||||
System.out.println(
|
||||
"[" + Thread.currentThread().getName() + "] Found " + links.size() + " links on " + url);
|
||||
|
||||
// Save URLs to file
|
||||
if (!links.isEmpty()) {
|
||||
saveUrlsToFile(links, url, depth);
|
||||
}
|
||||
|
||||
// Submit tasks for each discovered link
|
||||
for (String link : links) {
|
||||
CrawlTask newTask = new CrawlTask(link, depth + 1, crawler);
|
||||
crawler.submitTask(newTask);
|
||||
}
|
||||
} else {
|
||||
System.out.println("[" + Thread.currentThread().getName() + "] Max depth reached for: " + url);
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
System.err.println(
|
||||
"[" + Thread.currentThread().getName() + "] Error crawling " + url + ": " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch a web page and return as JSoup Document
|
||||
*/
|
||||
private Document fetchPage(String url) throws IOException {
|
||||
// Add timeout and user agent for responsible crawling
|
||||
return Jsoup.connect(url)
|
||||
.timeout(10000) // 10 second timeout
|
||||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
|
||||
.get();
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract all links from a page that are within the same domain
|
||||
* Parses HTML line by line and extracts any URLs found
|
||||
*/
|
||||
private Set<String> extractLinks(Document document, String pageUrl) throws URISyntaxException {
|
||||
Set<String> links = new HashSet<>();
|
||||
URI baseUri = new URI(pageUrl);
|
||||
String baseDomain = baseUri.getHost();
|
||||
|
||||
// Get the full HTML as text and split by lines
|
||||
String htmlContent = document.html();
|
||||
String[] lines = htmlContent.split("\n");
|
||||
|
||||
// Pattern to match URLs in HTML
|
||||
java.util.regex.Pattern urlPattern = java.util.regex.Pattern.compile(
|
||||
"https?://[^\\s\"'<>)]+");
|
||||
|
||||
for (String line : lines) {
|
||||
// Look for any URLs in this line
|
||||
java.util.regex.Matcher matcher = urlPattern.matcher(line);
|
||||
|
||||
while (matcher.find()) {
|
||||
String href = matcher.group();
|
||||
|
||||
if (href.isEmpty() || !href.startsWith("http")) {
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
URI linkUri = new URI(href);
|
||||
String linkHost = linkUri.getHost();
|
||||
|
||||
// Only crawl links from the same domain
|
||||
if (linkHost != null && linkHost.equals(baseDomain)) {
|
||||
links.add(href);
|
||||
}
|
||||
} catch (URISyntaxException e) {
|
||||
// Skip invalid URLs
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return links;
|
||||
}
|
||||
|
||||
/**
|
||||
* Save discovered URLs to a file with timestamp
|
||||
*/
|
||||
private synchronized void saveUrlsToFile(Set<String> urls, String sourceUrl, int depth) {
|
||||
try {
|
||||
String filepath = crawler.getOutputDir() + "/" + urlsFilename;
|
||||
|
||||
// Append URLs to file (create if doesn't exist)
|
||||
try (BufferedWriter writer = Files.newBufferedWriter(Paths.get(filepath),
|
||||
StandardOpenOption.CREATE, StandardOpenOption.APPEND)) {
|
||||
|
||||
writer.write("=== Found on: " + sourceUrl + " (depth: " + depth + ") ===\n");
|
||||
for (String url : urls) {
|
||||
writer.write(url + "\n");
|
||||
}
|
||||
writer.write("\n");
|
||||
}
|
||||
|
||||
System.out.println(
|
||||
"[" + Thread.currentThread().getName() + "] Appended " + urls.size() + " URLs to: " + urlsFilename);
|
||||
} catch (IOException e) {
|
||||
System.err.println("[" + Thread.currentThread().getName() + "] Failed to save URLs: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,153 @@
|
||||
package com.tinsae.crawler;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.*;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
/**
 * Multithreaded Web Crawler
 * - Starts from a root URL
 * - Spawns threads to crawl links found in pages
 * - Stores content locally
 * - Respects max threads and max depth limits
 *
 * @author Tinsae Ghilay
 */
public class Crawler {

    private final String rootUrl;   // seed URL the crawl starts from
    private final int maxThreads;   // worker pool size AND hard cap on total submitted tasks (see submitTask)
    private final int maxDepth;     // tasks deeper than this are never submitted
    private final String outputDir; // directory where CrawlTask writes discovered URLs

    // Thread-safe collections
    private final Set<String> visitedUrls = Collections.synchronizedSet(new HashSet<>());
    private final ExecutorService executorService;
    // Count of submitted tasks still running or queued; drives the shutdown poll in start().
    private final AtomicInteger activeThreads = new AtomicInteger(0);
    // Total tasks ever submitted. NOTE(review): despite the name this counts TASKS,
    // not OS threads -- the fixed pool never runs more than maxThreads threads.
    private final AtomicInteger totalThreadsStarted = new AtomicInteger(0);

    /**
     * Creates the crawler with a fixed-size worker pool and ensures the output
     * directory exists. Failure to create the directory is logged, not fatal.
     *
     * @param rootUrl    seed URL
     * @param maxThreads worker pool size; also caps the total number of crawl tasks
     * @param maxDepth   maximum link depth to follow (root is depth 0)
     * @param outputDir  directory for the crawled-URLs output file
     */
    public Crawler(String rootUrl, int maxThreads, int maxDepth, String outputDir) {
        this.rootUrl = rootUrl;
        this.maxThreads = maxThreads;
        this.maxDepth = maxDepth;
        this.outputDir = outputDir;
        this.executorService = Executors.newFixedThreadPool(maxThreads);

        // Create output directory if it doesn't exist
        try {
            Files.createDirectories(Paths.get(outputDir));
            System.out.println("Output directory created: " + outputDir);
        } catch (IOException e) {
            System.err.println("Failed to create output directory: " + e.getMessage());
        }
    }

    /**
     * Start crawling from the root URL.
     * Blocks the calling thread: polls every 500 ms until all submitted tasks
     * have finished, then shuts the executor down and prints a summary.
     *
     * @throws MalformedURLException declared for callers (Main catches it);
     *         not actually thrown by the current implementation
     */
    public void start() throws MalformedURLException {
        System.out.println("Starting crawler...");
        System.out.println("Root URL: " + rootUrl);
        System.out.println("Max Threads: " + maxThreads);
        System.out.println("Max Depth: " + maxDepth);
        System.out.println("Output Directory: " + outputDir);
        System.out.println("-------------------------------------------");

        // Submit the first task
        CrawlTask rootTask = new CrawlTask(rootUrl, 0, this);
        submitTask(rootTask);

        // Wait for all active threads to finish before shutting down.
        // The "totalThreadsStarted == 0" clause keeps us waiting until the root
        // task has actually been registered (guards against an early exit).
        while (activeThreads.get() > 0 || totalThreadsStarted.get() == 0) {
            try {
                Thread.sleep(500);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                break;
            }
        }

        // Now shutdown the executor
        executorService.shutdown();
        try {
            // Wait up to 5 minutes for completion
            if (!executorService.awaitTermination(5, TimeUnit.MINUTES)) {
                System.out.println("\nTimeout: Stopping crawler...");
                executorService.shutdownNow();
            }
        } catch (InterruptedException e) {
            System.err.println("Crawler interrupted: " + e.getMessage());
            executorService.shutdownNow();
            Thread.currentThread().interrupt();
        }

        System.out.println("\n-------------------------------------------");
        System.out.println("Crawling complete!");
        System.out.println("Total URLs visited: " + visitedUrls.size());
        System.out.println("Total threads started: " + totalThreadsStarted.get());
        System.out.println("Max depth reached: " + maxDepth);
    }

    /**
     * Submit a task to the executor if conditions are met.
     * Synchronized so the visited-check + counter updates happen atomically
     * when multiple worker threads submit discovered links concurrently.
     */
    synchronized void submitTask(CrawlTask task) {
        // Check if URL was already visited
        if (visitedUrls.contains(task.getUrl())) {
            return;
        }

        // Check if max depth exceeded
        if (task.getDepth() > maxDepth) {
            return;
        }

        // Check if max threads reached (but allow some overflow for queued tasks)
        // NOTE(review): this caps the TOTAL number of tasks ever submitted at
        // maxThreads, so the crawl stops after maxThreads URLs regardless of depth.
        if (totalThreadsStarted.get() >= maxThreads) {
            System.out.println("[Crawler] Max threads reached (" + maxThreads + "). Stopping new crawls.");
            return;
        }

        // Mark as visited and submit
        visitedUrls.add(task.getUrl());
        totalThreadsStarted.incrementAndGet();
        activeThreads.incrementAndGet();

        // Wrap the task so activeThreads is decremented even if run() throws;
        // start()'s poll loop depends on this counter reaching zero.
        executorService.submit(() -> {
            try {
                task.run();
            } finally {
                activeThreads.decrementAndGet();
            }
        });

        System.out.println("[Crawler] Submitted task #" + totalThreadsStarted.get() + " for: " + task.getUrl()
                + " (depth: " + task.getDepth() + ")");
    }

    /**
     * Get the set of visited URLs (thread-safe)
     */
    Set<String> getVisitedUrls() {
        return visitedUrls;
    }

    /**
     * Get the output directory
     */
    String getOutputDir() {
        return outputDir;
    }

    /**
     * Get max depth limit
     */
    int getMaxDepth() {
        return maxDepth;
    }
}
|
||||
@@ -0,0 +1,22 @@
|
||||
package com.tinsae.crawler;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
|
||||
public class Main {
|
||||
public static void main(String[] args) {
|
||||
// Configuration
|
||||
String rootUrl = "https://crawler-test.com/";
|
||||
rootUrl = "https://goodnews.eu/";
|
||||
int maxThreads = 1500;
|
||||
int maxDepth = 5;
|
||||
String outputDir = "crawled_content";
|
||||
|
||||
try {
|
||||
Crawler crawler = new Crawler(rootUrl, maxThreads, maxDepth, outputDir);
|
||||
crawler.start();
|
||||
} catch (MalformedURLException e) {
|
||||
System.err.println("Invalid root URL: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
week4_TinsaeGhilay/crawler/target/crawler-1.0-SNAPSHOT.jar
Normal file
BIN
week4_TinsaeGhilay/crawler/target/crawler-1.0-SNAPSHOT.jar
Normal file
Binary file not shown.
@@ -0,0 +1,3 @@
|
||||
artifactId=crawler
|
||||
groupId=com.tinsae.crawler
|
||||
version=1.0-SNAPSHOT
|
||||
@@ -0,0 +1,3 @@
|
||||
com/tinsae/crawler/Crawler.class
|
||||
com/tinsae/crawler/CrawlTask.class
|
||||
com/tinsae/crawler/Main.class
|
||||
@@ -0,0 +1,3 @@
|
||||
/home/tgk/Repos/Trusted/DistributedSystems/week4_TinsaeGhilay/crawler/src/main/java/com/tinsae/crawler/CrawlTask.java
|
||||
/home/tgk/Repos/Trusted/DistributedSystems/week4_TinsaeGhilay/crawler/src/main/java/com/tinsae/crawler/Crawler.java
|
||||
/home/tgk/Repos/Trusted/DistributedSystems/week4_TinsaeGhilay/crawler/src/main/java/com/tinsae/crawler/Main.java
|
||||
Reference in New Issue
Block a user