Compare commits

..

2 Commits

Author SHA1 Message Date
8714eadba0 removed junk files 2026-01-10 22:30:52 +01:00
fa16ff09aa done 2026-01-10 22:30:04 +01:00
13 changed files with 388 additions and 0 deletions

View File

@@ -0,0 +1,38 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Build descriptor for the multithreaded web crawler (Java 17, Maven). -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.tinsae.crawler</groupId>
<artifactId>crawler</artifactId>
<version>1.0-SNAPSHOT</version>
<!-- Compile source and bytecode at the Java 17 language level. -->
<properties>
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
</properties>
<dependencies>
<!-- JSoup for HTML parsing -->
<!-- NOTE(review): 1.15.3 is an older jsoup release; consider upgrading to a
     current version for bug/security fixes. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.15.3</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- exec-maven-plugin: allows `mvn exec:java` to launch the crawler's Main
     class. No <executions> are bound to a lifecycle phase, so the plugin
     only runs when invoked explicitly from the command line. -->
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>3.1.0</version>
<configuration>
<mainClass>com.tinsae.crawler.Main</mainClass>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,163 @@
package com.tinsae.crawler;
import java.io.BufferedWriter;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.HashSet;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
/**
 * CrawlTask represents a single URL crawl operation.
 *
 * <p>Runs on an executor worker thread: fetches the page, extracts its links,
 * appends the discovered URLs to a shared output file, and submits a new task
 * for each discovered link (until the crawler's max depth is reached).
 */
public class CrawlTask implements Runnable {

    private final String url;
    private final int depth;
    private final Crawler crawler;

    /** One output file per JVM run, shared by every task instance. */
    private static final String URLS_FILENAME =
            "crawled_urls_" + System.currentTimeMillis() + ".txt";

    /**
     * Lock guarding appends to the shared output file. The original code used a
     * {@code synchronized} instance method, but every task is a distinct
     * instance, so the per-instance lock serialized nothing; a single static
     * lock is required for cross-thread mutual exclusion on the shared file.
     */
    private static final Object FILE_LOCK = new Object();

    public CrawlTask(String url, int depth, Crawler crawler) {
        this.url = url;
        this.depth = depth;
        this.crawler = crawler;
    }

    /** @return the URL this task will crawl */
    public String getUrl() {
        return url;
    }

    /** @return the crawl depth of this task (root task is depth 0) */
    public int getDepth() {
        return depth;
    }

    @Override
    public void run() {
        System.out.println("[" + Thread.currentThread().getName() + "] Crawling: " + url + " (depth: " + depth + ")");
        try {
            // Fetch and parse the page
            Document document = fetchPage(url);
            if (document == null) {
                System.err.println("[" + Thread.currentThread().getName() + "] Failed to fetch: " + url);
                return;
            }
            // Extract links only if we haven't reached max depth yet
            if (depth < crawler.getMaxDepth()) {
                Set<String> links = extractLinks(document, url);
                System.out.println(
                        "[" + Thread.currentThread().getName() + "] Found " + links.size() + " links on " + url);
                // Save discovered URLs to the shared output file
                if (!links.isEmpty()) {
                    saveUrlsToFile(links, url, depth);
                }
                // Submit a follow-up task for each discovered link; the crawler
                // deduplicates visited URLs and enforces depth/budget limits.
                for (String link : links) {
                    crawler.submitTask(new CrawlTask(link, depth + 1, crawler));
                }
            } else {
                System.out.println("[" + Thread.currentThread().getName() + "] Max depth reached for: " + url);
            }
        } catch (Exception e) {
            System.err.println(
                    "[" + Thread.currentThread().getName() + "] Error crawling " + url + ": " + e.getMessage());
        }
    }

    /**
     * Fetch a web page and return it as a JSoup Document.
     *
     * @throws IOException on connection failure, timeout, or HTTP error status
     */
    private Document fetchPage(String url) throws IOException {
        // Timeout and user agent for responsible crawling
        return Jsoup.connect(url)
                .timeout(10000) // 10 second timeout
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
                .get();
    }

    /**
     * Extract all absolute http(s) links from the page's anchor tags.
     *
     * <p>The original implementation regex-scanned the raw HTML for absolute
     * URLs, which picked up URLs embedded in scripts/styles and missed every
     * relative link. Using JSoup's DOM with {@code absUrl("href")} resolves
     * relative hrefs against the page URL and follows only real anchors.
     *
     * <p>NOTE(review): a same-domain filter was present but deliberately
     * commented out in the original, so this version likewise returns links
     * to any domain.
     *
     * @param document parsed page (base URI is set by {@code Jsoup.connect})
     * @param pageUrl  URL the page was fetched from; kept for signature
     *                 compatibility and future same-domain filtering
     * @return the set of unique absolute http(s) URLs found on the page
     */
    private Set<String> extractLinks(Document document, String pageUrl) throws URISyntaxException {
        Set<String> links = new HashSet<>();
        for (Element anchor : document.select("a[href]")) {
            // absUrl resolves relative hrefs against the document's base URI;
            // returns "" when the href cannot be made absolute.
            String href = anchor.absUrl("href");
            if (href.isEmpty() || !href.startsWith("http")) {
                continue;
            }
            try {
                new URI(href); // validate; malformed URLs are skipped
                links.add(href);
            } catch (URISyntaxException ignored) {
                // Skip invalid URLs
            }
        }
        return links;
    }

    /**
     * Append the discovered URLs to the shared output file, grouped under a
     * header naming the source page and its depth. Writes are serialized via
     * FILE_LOCK so concurrent tasks never interleave output.
     */
    private void saveUrlsToFile(Set<String> urls, String sourceUrl, int depth) {
        synchronized (FILE_LOCK) {
            try {
                String filepath = crawler.getOutputDir() + "/" + URLS_FILENAME;
                // Append URLs to the file (create it if it doesn't exist)
                try (BufferedWriter writer = Files.newBufferedWriter(Paths.get(filepath),
                        StandardOpenOption.CREATE, StandardOpenOption.APPEND)) {
                    writer.write("=== Found on: " + sourceUrl + " (depth: " + depth + ") ===\n");
                    for (String u : urls) {
                        writer.write(u + "\n");
                    }
                    writer.write("\n");
                }
                System.out.println(
                        "[" + Thread.currentThread().getName() + "] Appended " + urls.size() + " URLs to: " + URLS_FILENAME);
            } catch (IOException e) {
                System.err.println("[" + Thread.currentThread().getName() + "] Failed to save URLs: " + e.getMessage());
            }
        }
    }
}

View File

@@ -0,0 +1,156 @@
package com.tinsae.crawler;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Multithreaded Web Crawler
 * - Starts from a root URL
 * - Spawns threads to crawl links found in pages
 * - Stores content locally
 * - Respects max threads and max depth limits
 *
 * @author Tinsae Ghilay
 */
public class Crawler {
private final String rootUrl;
private final int maxThreads;
private final int maxDepth;
private final String outputDir;
// Thread-safe collections
// visitedUrls is a synchronized Set; combined with the synchronized
// submitTask() below, the contains-then-add check is atomic across threads.
private final Set<String> visitedUrls = Collections.synchronizedSet(new HashSet<>());
private final ExecutorService executorService;
// Tasks currently executing: incremented at submission, decremented in the
// task wrapper's finally block. Drives the wait loop in start().
private final AtomicInteger activeThreads = new AtomicInteger(0);
// Total tasks ever submitted; also reused as the submission cap in
// submitTask() (see NOTE there).
private final AtomicInteger totalThreadsStarted = new AtomicInteger(0);
/**
 * Create a crawler and ensure the output directory exists.
 *
 * @param rootUrl    first URL to crawl (depth 0)
 * @param maxThreads size of the fixed worker pool; NOTE(review): submitTask
 *                   also uses this value as a cap on the TOTAL number of
 *                   tasks ever submitted, so it bounds total URLs crawled
 * @param maxDepth   maximum link-follow depth from the root
 * @param outputDir  directory for result files (created if missing)
 */
public Crawler(String rootUrl, int maxThreads, int maxDepth, String outputDir) {
this.rootUrl = rootUrl;
this.maxThreads = maxThreads;
this.maxDepth = maxDepth;
this.outputDir = outputDir;
this.executorService = Executors.newFixedThreadPool(maxThreads);
// Create output directory if it doesn't exist
try {
Files.createDirectories(Paths.get(outputDir));
System.out.println("Output directory created: " + outputDir);
} catch (IOException e) {
// Best-effort: crawl continues; file writes will fail later and be logged.
System.err.println("Failed to create output directory: " + e.getMessage());
}
}
/**
 * Start crawling from the root URL.
 * Blocks until all submitted tasks have finished (polled every 500 ms),
 * then shuts the executor down and prints summary statistics.
 *
 * NOTE(review): declared to throw MalformedURLException, but nothing in this
 * method constructs a URL, so it is never actually thrown here; kept for
 * source compatibility with existing callers.
 */
public void start() throws MalformedURLException {
System.out.println("Starting crawler...");
System.out.println("Root URL: " + rootUrl);
System.out.println("Max Threads: " + maxThreads);
System.out.println("Max Depth: " + maxDepth);
System.out.println("Output Directory: " + outputDir);
System.out.println("-------------------------------------------");
// Submit the first task
CrawlTask rootTask = new CrawlTask(rootUrl, 0, this);
submitTask(rootTask);
// Wait for all active threads to finish before shutting down
// The `totalThreadsStarted.get() == 0` clause keeps us polling until the
// root task has been counted, so we don't fall through before work starts.
// NOTE(review): if the root task is never submitted (e.g. maxDepth < 0
// makes submitTask reject it), this loop would spin forever — verify.
while (activeThreads.get() > 0 || totalThreadsStarted.get() == 0) {
try {
Thread.sleep(500);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
break;
}
}
// Now shutdown the executor
executorService.shutdown();
try {
// Wait up to 5 minutes for completion
if (!executorService.awaitTermination(5, TimeUnit.MINUTES)) {
System.out.println("\nTimeout: Stopping crawler...");
executorService.shutdownNow();
}
} catch (InterruptedException e) {
System.err.println("Crawler interrupted: " + e.getMessage());
executorService.shutdownNow();
Thread.currentThread().interrupt();
}
System.out.println("\n-------------------------------------------");
System.out.println("Crawling complete!");
System.out.println("Total URLs visited: " + visitedUrls.size());
System.out.println("Total threads started: " + totalThreadsStarted.get());
// NOTE(review): this prints the configured depth limit, not the deepest
// level actually visited.
System.out.println("Max depth reached: " + maxDepth);
}
/**
 * Submit a task to the executor if conditions are met
 * (URL unvisited, depth within limit, submission budget not exhausted).
 * synchronized so the visited-check, add, and counter updates happen
 * atomically with respect to other crawler threads calling in.
 */
synchronized void submitTask(CrawlTask task) {
// Check if URL was already visited
if (visitedUrls.contains(task.getUrl())) {
return;
}
// Check if max depth exceeded
if (task.getDepth() > maxDepth) {
return;
}
// Check if max threads reached (but allow some overflow for queued tasks)
// NOTE(review): this compares total tasks EVER submitted against maxThreads,
// so it acts as a lifetime task budget rather than a concurrency limit
// (the fixed-size pool already bounds concurrency).
if (totalThreadsStarted.get() >= maxThreads) {
System.out.println("[Crawler] Max threads reached (" + maxThreads + "). Stopping new crawls.");
return;
}
// Mark as visited and submit
visitedUrls.add(task.getUrl());
// Increment the count of total threads started
totalThreadsStarted.incrementAndGet();
// Increment active thread count because we're about to submit a new task
activeThreads.incrementAndGet();
// Submit the task to the executor
executorService.submit(() -> {
try {
task.run();
} finally {
// decrease active thread count when done
activeThreads.decrementAndGet();
}
});
System.out.println("[Crawler] Submitted task #" + totalThreadsStarted.get() + " for: " + task.getUrl()
+ " (depth: " + task.getDepth() + ")");
}
/**
 * Get the set of visited URLs (thread-safe)
 */
Set<String> getVisitedUrls() {
return visitedUrls;
}
/**
 * Get the output directory
 */
String getOutputDir() {
return outputDir;
}
/**
 * Get max depth limit
 */
int getMaxDepth() {
return maxDepth;
}
}

View File

@@ -0,0 +1,22 @@
package com.tinsae.crawler;
import java.net.MalformedURLException;
public class Main {
    /**
     * Entry point. Optional arguments, in order:
     * {@code [rootUrl] [maxThreads] [maxDepth] [outputDir]}.
     * Defaults preserve the previous hard-coded configuration.
     */
    public static void main(String[] args) {
        // The original code assigned rootUrl = "https://crawler-test.com/" and
        // then immediately overwrote it; the dead assignment is removed and the
        // effective value kept as the default.
        String rootUrl = args.length > 0 ? args[0] : "https://goodnews.eu/";
        // NOTE(review): 1500 is used both as the fixed pool size and (inside
        // Crawler.submitTask) as the total-task budget — confirm intent.
        int maxThreads = args.length > 1 ? Integer.parseInt(args[1]) : 1500;
        int maxDepth = args.length > 2 ? Integer.parseInt(args[2]) : 5;
        String outputDir = args.length > 3 ? args[3] : "crawled_content";
        try {
            Crawler crawler = new Crawler(rootUrl, maxThreads, maxDepth, outputDir);
            crawler.start();
        } catch (MalformedURLException e) {
            System.err.println("Invalid root URL: " + e.getMessage());
        }
    }
}

View File

@@ -0,0 +1,3 @@
artifactId=crawler
groupId=com.tinsae.crawler
version=1.0-SNAPSHOT

View File

@@ -0,0 +1,3 @@
com/tinsae/crawler/Crawler.class
com/tinsae/crawler/CrawlTask.class
com/tinsae/crawler/Main.class

View File

@@ -0,0 +1,3 @@
/home/tgk/Repos/Trusted/DistributedSystems/week4_TinsaeGhilay/crawler/src/main/java/com/tinsae/crawler/CrawlTask.java
/home/tgk/Repos/Trusted/DistributedSystems/week4_TinsaeGhilay/crawler/src/main/java/com/tinsae/crawler/Crawler.java
/home/tgk/Repos/Trusted/DistributedSystems/week4_TinsaeGhilay/crawler/src/main/java/com/tinsae/crawler/Main.java