started ex_4
This commit is contained in:
9
.vscode/launch.json
vendored
9
.vscode/launch.json
vendored
@@ -53,6 +53,15 @@
|
|||||||
"preLaunchTask": "build-task5",
|
"preLaunchTask": "build-task5",
|
||||||
"console": "integratedTerminal"
|
"console": "integratedTerminal"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"type": "java",
|
||||||
|
"name": "Run Crawler (Week 4)",
|
||||||
|
"request": "launch",
|
||||||
|
"cwd": "${workspaceFolder}/week4_TinsaeGhilay/crawler",
|
||||||
|
"projectName": "crawler",
|
||||||
|
"mainClass": "com.tinsae.crawler.Main",
|
||||||
|
"console": "integratedTerminal"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"type": "java",
|
"type": "java",
|
||||||
"name": "Run HelloWorldServer (Task 7/example)",
|
"name": "Run HelloWorldServer (Task 7/example)",
|
||||||
|
|||||||
BIN
week3_TinsaeGhilay.zip
Normal file
BIN
week3_TinsaeGhilay.zip
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
38
week4_TinsaeGhilay/crawler/pom.xml
Normal file
38
week4_TinsaeGhilay/crawler/pom.xml
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the week-4 multithreaded crawler exercise. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.tinsae.crawler</groupId>
    <artifactId>crawler</artifactId>
    <version>1.0-SNAPSHOT</version>

    <!-- Compile source and bytecode for Java 17. -->
    <properties>
        <maven.compiler.source>17</maven.compiler.source>
        <maven.compiler.target>17</maven.compiler.target>
    </properties>

    <dependencies>
        <!-- JSoup for HTML parsing -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.15.3</version>
        </dependency>
    </dependencies>

    <!-- `mvn exec:java` launches the crawler entry point. -->
    <build>
        <plugins>
            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>exec-maven-plugin</artifactId>
                <version>3.1.0</version>
                <configuration>
                    <mainClass>com.tinsae.crawler.Main</mainClass>
                </configuration>
            </plugin>
        </plugins>
    </build>

</project>
|
||||||
@@ -0,0 +1,160 @@
|
|||||||
|
package com.tinsae.crawler;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.BufferedWriter;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.HashSet;
import java.util.Set;

/**
 * CrawlTask represents a single URL crawl operation.
 *
 * <p>Runs on an executor thread: fetches the page, extracts same-domain
 * links, appends the discovered URLs to a shared output file, and submits
 * a new CrawlTask (at depth + 1) for each discovered link.
 */
public class CrawlTask implements Runnable {

    private final String url;
    private final int depth;
    private final Crawler crawler;

    // One shared output file per JVM run, named once at class-load time.
    private static final String URLS_FILENAME =
            "crawled_urls_" + System.currentTimeMillis() + ".txt";

    // The output file is shared by ALL task instances, so writes must be
    // guarded by a static lock. (The previous instance-level `synchronized`
    // method locked per-task and did not prevent interleaved writes from
    // concurrent tasks.)
    private static final Object FILE_LOCK = new Object();

    /**
     * @param url     absolute URL to fetch
     * @param depth   crawl depth of this page (root = 0)
     * @param crawler owning crawler, consulted for limits and task submission
     */
    public CrawlTask(String url, int depth, Crawler crawler) {
        this.url = url;
        this.depth = depth;
        this.crawler = crawler;
    }

    /** URL this task will fetch. */
    public String getUrl() {
        return url;
    }

    /** Crawl depth of this task (root = 0). */
    public int getDepth() {
        return depth;
    }

    /**
     * Fetch the page, extract links (unless at max depth), persist them,
     * and submit follow-up tasks. All failures are logged, never rethrown,
     * so one bad page cannot kill an executor thread's task.
     */
    @Override
    public void run() {
        System.out.println("[" + Thread.currentThread().getName() + "] Crawling: " + url + " (depth: " + depth + ")");

        try {
            // Fetch and parse the page
            Document document = fetchPage(url);
            if (document == null) {
                System.err.println("[" + Thread.currentThread().getName() + "] Failed to fetch: " + url);
                return;
            }

            // Extract links if we haven't reached max depth
            if (depth < crawler.getMaxDepth()) {
                Set<String> links = extractLinks(document, url);
                System.out.println(
                        "[" + Thread.currentThread().getName() + "] Found " + links.size() + " links on " + url);

                // Save URLs to file
                if (!links.isEmpty()) {
                    saveUrlsToFile(links, url, depth);
                }

                // Submit tasks for each discovered link
                for (String link : links) {
                    CrawlTask newTask = new CrawlTask(link, depth + 1, crawler);
                    crawler.submitTask(newTask);
                }
            } else {
                System.out.println("[" + Thread.currentThread().getName() + "] Max depth reached for: " + url);
            }

        } catch (Exception e) {
            System.err.println(
                    "[" + Thread.currentThread().getName() + "] Error crawling " + url + ": " + e.getMessage());
        }
    }

    /**
     * Fetch a web page and return it as a JSoup Document.
     *
     * @throws IOException on connect/read failure or non-OK HTTP status
     */
    private Document fetchPage(String url) throws IOException {
        // Add timeout and user agent for responsible crawling
        return Jsoup.connect(url)
                .timeout(10000) // 10 second timeout
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
                .get();
    }

    /**
     * Extract all anchor links on the page that point to the same host as
     * {@code pageUrl}.
     *
     * @throws URISyntaxException if {@code pageUrl} itself is not a valid URI
     */
    private Set<String> extractLinks(Document document, String pageUrl) throws URISyntaxException {
        Set<String> links = new HashSet<>();
        URI baseUri = new URI(pageUrl);
        String baseDomain = baseUri.getHost();

        // Walk the parsed DOM instead of regex-scanning raw HTML (the previous
        // approach): absUrl("href") resolves relative links against the page's
        // base URI, and URLs buried in scripts or plain text are no longer
        // picked up by accident.
        for (var anchor : document.select("a[href]")) {
            String href = anchor.absUrl("href");

            if (href.isEmpty() || !href.startsWith("http")) {
                continue;
            }

            try {
                URI linkUri = new URI(href);
                String linkHost = linkUri.getHost();

                // Only crawl links from the same domain
                if (linkHost != null && linkHost.equals(baseDomain)) {
                    links.add(href);
                }
            } catch (URISyntaxException e) {
                // Skip invalid URLs; one bad href must not abort the page.
            }
        }

        return links;
    }

    /**
     * Append the discovered URLs to the shared output file, tagged with the
     * page they were found on and the crawl depth. Guarded by a static lock
     * because the file is shared across all task instances.
     */
    private void saveUrlsToFile(Set<String> urls, String sourceUrl, int depth) {
        synchronized (FILE_LOCK) {
            try {
                String filepath = crawler.getOutputDir() + "/" + URLS_FILENAME;

                // Append URLs to file (create if doesn't exist)
                try (BufferedWriter writer = Files.newBufferedWriter(Paths.get(filepath),
                        StandardOpenOption.CREATE, StandardOpenOption.APPEND)) {

                    writer.write("=== Found on: " + sourceUrl + " (depth: " + depth + ") ===\n");
                    for (String url : urls) {
                        writer.write(url + "\n");
                    }
                    writer.write("\n");
                }

                System.out.println(
                        "[" + Thread.currentThread().getName() + "] Appended " + urls.size() + " URLs to: " + URLS_FILENAME);
            } catch (IOException e) {
                System.err.println("[" + Thread.currentThread().getName() + "] Failed to save URLs: " + e.getMessage());
            }
        }
    }
}
|
||||||
@@ -0,0 +1,153 @@
|
|||||||
|
package com.tinsae.crawler;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Multithreaded Web Crawler
 * - Starts from a root URL
 * - Runs crawl tasks on a fixed thread pool
 * - Stores discovered URLs locally (see CrawlTask)
 * - Respects max task-count and max depth limits
 *
 * @author Tinsae Ghilay
 */
public class Crawler {

    private final String rootUrl;
    // NOTE: despite the name, this bounds the TOTAL number of tasks submitted
    // over the whole crawl, and also sizes the fixed thread pool.
    private final int maxThreads;
    private final int maxDepth;
    private final String outputDir;

    // Thread-safe shared state
    private final Set<String> visitedUrls = Collections.synchronizedSet(new HashSet<>());
    private final ExecutorService executorService;
    private final AtomicInteger activeThreads = new AtomicInteger(0);       // tasks currently executing
    private final AtomicInteger totalThreadsStarted = new AtomicInteger(0); // tasks ever submitted

    /**
     * @param rootUrl    starting URL for the crawl
     * @param maxThreads pool size AND upper bound on total submitted tasks
     * @param maxDepth   maximum link depth from the root (root = 0)
     * @param outputDir  directory for crawl output files (created if absent)
     */
    public Crawler(String rootUrl, int maxThreads, int maxDepth, String outputDir) {
        this.rootUrl = rootUrl;
        this.maxThreads = maxThreads;
        this.maxDepth = maxDepth;
        this.outputDir = outputDir;
        this.executorService = Executors.newFixedThreadPool(maxThreads);

        // Create output directory if it doesn't exist
        try {
            Files.createDirectories(Paths.get(outputDir));
            System.out.println("Output directory created: " + outputDir);
        } catch (IOException e) {
            // Best-effort: the crawl proceeds, but file writes will likely fail.
            System.err.println("Failed to create output directory: " + e.getMessage());
        }
    }

    /**
     * Start crawling from the root URL and block until the crawl finishes
     * (or the 5-minute shutdown timeout elapses).
     *
     * @throws MalformedURLException if the root URL is not a valid URL
     */
    public void start() throws MalformedURLException {
        // Fail fast on an invalid root URL. (Previously this exception was
        // declared but nothing in the method could actually throw it.)
        new URL(rootUrl);

        System.out.println("Starting crawler...");
        System.out.println("Root URL: " + rootUrl);
        System.out.println("Max Threads: " + maxThreads);
        System.out.println("Max Depth: " + maxDepth);
        System.out.println("Output Directory: " + outputDir);
        System.out.println("-------------------------------------------");

        // Submit the first task
        CrawlTask rootTask = new CrawlTask(rootUrl, 0, this);
        submitTask(rootTask);

        // Poll until all in-flight tasks have finished. The `== 0` clause
        // covers the brief window before the root task has been counted.
        while (activeThreads.get() > 0 || totalThreadsStarted.get() == 0) {
            try {
                Thread.sleep(500);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt(); // preserve interrupt status
                break;
            }
        }

        // Now shutdown the executor
        executorService.shutdown();
        try {
            // Wait up to 5 minutes for completion
            if (!executorService.awaitTermination(5, TimeUnit.MINUTES)) {
                System.out.println("\nTimeout: Stopping crawler...");
                executorService.shutdownNow();
            }
        } catch (InterruptedException e) {
            System.err.println("Crawler interrupted: " + e.getMessage());
            executorService.shutdownNow();
            Thread.currentThread().interrupt();
        }

        System.out.println("\n-------------------------------------------");
        System.out.println("Crawling complete!");
        System.out.println("Total URLs visited: " + visitedUrls.size());
        System.out.println("Total threads started: " + totalThreadsStarted.get());
        System.out.println("Max depth reached: " + maxDepth);
    }

    /**
     * Submit a crawl task unless its URL was already visited, it exceeds the
     * depth limit, or the total-task budget is spent. Synchronized so the
     * visited-check and mark-visited happen atomically across callers.
     */
    synchronized void submitTask(CrawlTask task) {
        // Check if URL was already visited
        if (visitedUrls.contains(task.getUrl())) {
            return;
        }

        // Check if max depth exceeded
        if (task.getDepth() > maxDepth) {
            return;
        }

        // Check if max threads reached (but allow some overflow for queued tasks)
        if (totalThreadsStarted.get() >= maxThreads) {
            System.out.println("[Crawler] Max threads reached (" + maxThreads + "). Stopping new crawls.");
            return;
        }

        // Mark as visited and submit
        visitedUrls.add(task.getUrl());
        totalThreadsStarted.incrementAndGet();
        activeThreads.incrementAndGet();

        executorService.submit(() -> {
            try {
                task.run();
            } finally {
                // Always decrement, even if the task throws.
                activeThreads.decrementAndGet();
            }
        });

        System.out.println("[Crawler] Submitted task #" + totalThreadsStarted.get() + " for: " + task.getUrl()
                + " (depth: " + task.getDepth() + ")");
    }

    /** Get the set of visited URLs (thread-safe live view, not a copy). */
    Set<String> getVisitedUrls() {
        return visitedUrls;
    }

    /** Get the output directory. */
    String getOutputDir() {
        return outputDir;
    }

    /** Get max depth limit. */
    int getMaxDepth() {
        return maxDepth;
    }
}
|
||||||
@@ -0,0 +1,22 @@
|
|||||||
|
package com.tinsae.crawler;

import java.net.MalformedURLException;

/**
 * Entry point for the week-4 crawler exercise: configures the crawl
 * (root URL, task/thread limit, depth limit, output directory) and runs
 * it to completion.
 */
public class Main {

    public static void main(String[] args) {
        // Configuration.
        // NOTE: the previous version assigned "https://crawler-test.com/" and
        // then immediately overwrote it; the dead store is removed here — the
        // effective root URL is unchanged.
        String rootUrl = "https://goodnews.eu/";
        // Also sizes the fixed thread pool; 1500 OS threads is very heavy for
        // a single-host crawl — consider lowering.
        int maxThreads = 1500;
        int maxDepth = 5;
        String outputDir = "crawled_content";

        try {
            Crawler crawler = new Crawler(rootUrl, maxThreads, maxDepth, outputDir);
            crawler.start();
        } catch (MalformedURLException e) {
            System.err.println("Invalid root URL: " + e.getMessage());
        }
    }
}
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
week4_TinsaeGhilay/crawler/target/crawler-1.0-SNAPSHOT.jar
Normal file
BIN
week4_TinsaeGhilay/crawler/target/crawler-1.0-SNAPSHOT.jar
Normal file
Binary file not shown.
@@ -0,0 +1,3 @@
|
|||||||
|
artifactId=crawler
|
||||||
|
groupId=com.tinsae.crawler
|
||||||
|
version=1.0-SNAPSHOT
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
com/tinsae/crawler/Crawler.class
|
||||||
|
com/tinsae/crawler/CrawlTask.class
|
||||||
|
com/tinsae/crawler/Main.class
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
/home/tgk/Repos/Trusted/DistributedSystems/week4_TinsaeGhilay/crawler/src/main/java/com/tinsae/crawler/CrawlTask.java
|
||||||
|
/home/tgk/Repos/Trusted/DistributedSystems/week4_TinsaeGhilay/crawler/src/main/java/com/tinsae/crawler/Crawler.java
|
||||||
|
/home/tgk/Repos/Trusted/DistributedSystems/week4_TinsaeGhilay/crawler/src/main/java/com/tinsae/crawler/Main.java
|
||||||
Reference in New Issue
Block a user