started ex_4
This commit is contained in:
9
.vscode/launch.json
vendored
9
.vscode/launch.json
vendored
@@ -53,6 +53,15 @@
|
||||
"preLaunchTask": "build-task5",
|
||||
"console": "integratedTerminal"
|
||||
},
|
||||
{
|
||||
"type": "java",
|
||||
"name": "Run Crawler (Week 4)",
|
||||
"request": "launch",
|
||||
"cwd": "${workspaceFolder}/week4_TinsaeGhilay/crawler",
|
||||
"projectName": "crawler",
|
||||
"mainClass": "com.tinsae.crawler.Main",
|
||||
"console": "integratedTerminal"
|
||||
},
|
||||
{
|
||||
"type": "java",
|
||||
"name": "Run HelloWorldServer (Task 7/example)",
|
||||
|
||||
BIN
week3_TinsaeGhilay.zip
Normal file
BIN
week3_TinsaeGhilay.zip
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
38
week4_TinsaeGhilay/crawler/pom.xml
Normal file
38
week4_TinsaeGhilay/crawler/pom.xml
Normal file
@@ -0,0 +1,38 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.tinsae.crawler</groupId>
    <artifactId>crawler</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <!-- Pin the source encoding so builds are reproducible across platforms;
             without this Maven warns and falls back to the platform default. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>17</maven.compiler.source>
        <maven.compiler.target>17</maven.compiler.target>
    </properties>

    <dependencies>
        <!-- JSoup for HTML parsing -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.15.3</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Lets `mvn exec:java` run the crawler entry point directly. -->
            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>exec-maven-plugin</artifactId>
                <version>3.1.0</version>
                <configuration>
                    <mainClass>com.tinsae.crawler.Main</mainClass>
                </configuration>
            </plugin>
        </plugins>
    </build>

</project>
|
||||
@@ -0,0 +1,160 @@
|
||||
package com.tinsae.crawler;
|
||||
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* CrawlTask represents a single URL crawl operation
|
||||
* Runs in a separate thread, fetches content, extracts links,
|
||||
* appends URLs to file, and submits new tasks for discovered links.
|
||||
*/
|
||||
public class CrawlTask implements Runnable {
|
||||
|
||||
private final String url;
|
||||
private final int depth;
|
||||
private final Crawler crawler;
|
||||
private static final String urlsFilename = "crawled_urls_" + System.currentTimeMillis() + ".txt";
|
||||
|
||||
public CrawlTask(String url, int depth, Crawler crawler) {
|
||||
this.url = url;
|
||||
this.depth = depth;
|
||||
this.crawler = crawler;
|
||||
}
|
||||
|
||||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
public int getDepth() {
|
||||
return depth;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
System.out.println("[" + Thread.currentThread().getName() + "] Crawling: " + url + " (depth: " + depth + ")");
|
||||
|
||||
try {
|
||||
// Fetch and parse the page
|
||||
Document document = fetchPage(url);
|
||||
if (document == null) {
|
||||
System.err.println("[" + Thread.currentThread().getName() + "] Failed to fetch: " + url);
|
||||
return;
|
||||
}
|
||||
|
||||
// Extract links if we haven't reached max depth
|
||||
if (depth < crawler.getMaxDepth()) {
|
||||
Set<String> links = extractLinks(document, url);
|
||||
System.out.println(
|
||||
"[" + Thread.currentThread().getName() + "] Found " + links.size() + " links on " + url);
|
||||
|
||||
// Save URLs to file
|
||||
if (!links.isEmpty()) {
|
||||
saveUrlsToFile(links, url, depth);
|
||||
}
|
||||
|
||||
// Submit tasks for each discovered link
|
||||
for (String link : links) {
|
||||
CrawlTask newTask = new CrawlTask(link, depth + 1, crawler);
|
||||
crawler.submitTask(newTask);
|
||||
}
|
||||
} else {
|
||||
System.out.println("[" + Thread.currentThread().getName() + "] Max depth reached for: " + url);
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
System.err.println(
|
||||
"[" + Thread.currentThread().getName() + "] Error crawling " + url + ": " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch a web page and return as JSoup Document
|
||||
*/
|
||||
private Document fetchPage(String url) throws IOException {
|
||||
// Add timeout and user agent for responsible crawling
|
||||
return Jsoup.connect(url)
|
||||
.timeout(10000) // 10 second timeout
|
||||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
|
||||
.get();
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract all links from a page that are within the same domain
|
||||
* Parses HTML line by line and extracts any URLs found
|
||||
*/
|
||||
private Set<String> extractLinks(Document document, String pageUrl) throws URISyntaxException {
|
||||
Set<String> links = new HashSet<>();
|
||||
URI baseUri = new URI(pageUrl);
|
||||
String baseDomain = baseUri.getHost();
|
||||
|
||||
// Get the full HTML as text and split by lines
|
||||
String htmlContent = document.html();
|
||||
String[] lines = htmlContent.split("\n");
|
||||
|
||||
// Pattern to match URLs in HTML
|
||||
java.util.regex.Pattern urlPattern = java.util.regex.Pattern.compile(
|
||||
"https?://[^\\s\"'<>)]+");
|
||||
|
||||
for (String line : lines) {
|
||||
// Look for any URLs in this line
|
||||
java.util.regex.Matcher matcher = urlPattern.matcher(line);
|
||||
|
||||
while (matcher.find()) {
|
||||
String href = matcher.group();
|
||||
|
||||
if (href.isEmpty() || !href.startsWith("http")) {
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
URI linkUri = new URI(href);
|
||||
String linkHost = linkUri.getHost();
|
||||
|
||||
// Only crawl links from the same domain
|
||||
if (linkHost != null && linkHost.equals(baseDomain)) {
|
||||
links.add(href);
|
||||
}
|
||||
} catch (URISyntaxException e) {
|
||||
// Skip invalid URLs
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return links;
|
||||
}
|
||||
|
||||
/**
|
||||
* Save discovered URLs to a file with timestamp
|
||||
*/
|
||||
private synchronized void saveUrlsToFile(Set<String> urls, String sourceUrl, int depth) {
|
||||
try {
|
||||
String filepath = crawler.getOutputDir() + "/" + urlsFilename;
|
||||
|
||||
// Append URLs to file (create if doesn't exist)
|
||||
try (BufferedWriter writer = Files.newBufferedWriter(Paths.get(filepath),
|
||||
StandardOpenOption.CREATE, StandardOpenOption.APPEND)) {
|
||||
|
||||
writer.write("=== Found on: " + sourceUrl + " (depth: " + depth + ") ===\n");
|
||||
for (String url : urls) {
|
||||
writer.write(url + "\n");
|
||||
}
|
||||
writer.write("\n");
|
||||
}
|
||||
|
||||
System.out.println(
|
||||
"[" + Thread.currentThread().getName() + "] Appended " + urls.size() + " URLs to: " + urlsFilename);
|
||||
} catch (IOException e) {
|
||||
System.err.println("[" + Thread.currentThread().getName() + "] Failed to save URLs: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,153 @@
|
||||
package com.tinsae.crawler;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.*;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
/**
 * Multithreaded Web Crawler
 * - Starts from a root URL
 * - Spawns threads to crawl links found in pages
 * - Stores content locally
 * - Respects max threads and max depth limits
 *
 * @author Tinsae Ghilay
 */
public class Crawler {

    private final String rootUrl;   // seed URL the crawl starts from
    private final int maxThreads;   // worker pool size AND hard cap on total submitted tasks (see submitTask)
    private final int maxDepth;     // tasks deeper than this are never submitted
    private final String outputDir; // directory where CrawlTask writes discovered URLs

    // Thread-safe collections
    private final Set<String> visitedUrls = Collections.synchronizedSet(new HashSet<>());
    private final ExecutorService executorService;
    // Count of submitted tasks still running or queued; drives the shutdown poll in start().
    private final AtomicInteger activeThreads = new AtomicInteger(0);
    // Total tasks ever submitted. NOTE(review): despite the name this counts TASKS,
    // not OS threads -- the fixed pool never runs more than maxThreads threads.
    private final AtomicInteger totalThreadsStarted = new AtomicInteger(0);

    /**
     * Creates the crawler with a fixed-size worker pool and ensures the output
     * directory exists. Failure to create the directory is logged, not fatal.
     *
     * @param rootUrl    seed URL
     * @param maxThreads worker pool size; also caps the total number of crawl tasks
     * @param maxDepth   maximum link depth to follow (root is depth 0)
     * @param outputDir  directory for the crawled-URLs output file
     */
    public Crawler(String rootUrl, int maxThreads, int maxDepth, String outputDir) {
        this.rootUrl = rootUrl;
        this.maxThreads = maxThreads;
        this.maxDepth = maxDepth;
        this.outputDir = outputDir;
        this.executorService = Executors.newFixedThreadPool(maxThreads);

        // Create output directory if it doesn't exist
        try {
            Files.createDirectories(Paths.get(outputDir));
            System.out.println("Output directory created: " + outputDir);
        } catch (IOException e) {
            System.err.println("Failed to create output directory: " + e.getMessage());
        }
    }

    /**
     * Start crawling from the root URL.
     * Blocks the calling thread: polls every 500 ms until all submitted tasks
     * have finished, then shuts the executor down and prints a summary.
     *
     * @throws MalformedURLException declared for callers (Main catches it);
     *         not actually thrown by the current implementation
     */
    public void start() throws MalformedURLException {
        System.out.println("Starting crawler...");
        System.out.println("Root URL: " + rootUrl);
        System.out.println("Max Threads: " + maxThreads);
        System.out.println("Max Depth: " + maxDepth);
        System.out.println("Output Directory: " + outputDir);
        System.out.println("-------------------------------------------");

        // Submit the first task
        CrawlTask rootTask = new CrawlTask(rootUrl, 0, this);
        submitTask(rootTask);

        // Wait for all active threads to finish before shutting down.
        // The "totalThreadsStarted == 0" clause keeps us waiting until the root
        // task has actually been registered (guards against an early exit).
        while (activeThreads.get() > 0 || totalThreadsStarted.get() == 0) {
            try {
                Thread.sleep(500);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                break;
            }
        }

        // Now shutdown the executor
        executorService.shutdown();
        try {
            // Wait up to 5 minutes for completion
            if (!executorService.awaitTermination(5, TimeUnit.MINUTES)) {
                System.out.println("\nTimeout: Stopping crawler...");
                executorService.shutdownNow();
            }
        } catch (InterruptedException e) {
            System.err.println("Crawler interrupted: " + e.getMessage());
            executorService.shutdownNow();
            Thread.currentThread().interrupt();
        }

        System.out.println("\n-------------------------------------------");
        System.out.println("Crawling complete!");
        System.out.println("Total URLs visited: " + visitedUrls.size());
        System.out.println("Total threads started: " + totalThreadsStarted.get());
        System.out.println("Max depth reached: " + maxDepth);
    }

    /**
     * Submit a task to the executor if conditions are met.
     * Synchronized so the visited-check + counter updates happen atomically
     * when multiple worker threads submit discovered links concurrently.
     */
    synchronized void submitTask(CrawlTask task) {
        // Check if URL was already visited
        if (visitedUrls.contains(task.getUrl())) {
            return;
        }

        // Check if max depth exceeded
        if (task.getDepth() > maxDepth) {
            return;
        }

        // Check if max threads reached (but allow some overflow for queued tasks)
        // NOTE(review): this caps the TOTAL number of tasks ever submitted at
        // maxThreads, so the crawl stops after maxThreads URLs regardless of depth.
        if (totalThreadsStarted.get() >= maxThreads) {
            System.out.println("[Crawler] Max threads reached (" + maxThreads + "). Stopping new crawls.");
            return;
        }

        // Mark as visited and submit
        visitedUrls.add(task.getUrl());
        totalThreadsStarted.incrementAndGet();
        activeThreads.incrementAndGet();

        // Wrap the task so activeThreads is decremented even if run() throws;
        // start()'s poll loop depends on this counter reaching zero.
        executorService.submit(() -> {
            try {
                task.run();
            } finally {
                activeThreads.decrementAndGet();
            }
        });

        System.out.println("[Crawler] Submitted task #" + totalThreadsStarted.get() + " for: " + task.getUrl()
                + " (depth: " + task.getDepth() + ")");
    }

    /**
     * Get the set of visited URLs (thread-safe)
     */
    Set<String> getVisitedUrls() {
        return visitedUrls;
    }

    /**
     * Get the output directory
     */
    String getOutputDir() {
        return outputDir;
    }

    /**
     * Get max depth limit
     */
    int getMaxDepth() {
        return maxDepth;
    }
}
|
||||
@@ -0,0 +1,22 @@
|
||||
package com.tinsae.crawler;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
|
||||
public class Main {
|
||||
public static void main(String[] args) {
|
||||
// Configuration
|
||||
String rootUrl = "https://crawler-test.com/";
|
||||
rootUrl = "https://goodnews.eu/";
|
||||
int maxThreads = 1500;
|
||||
int maxDepth = 5;
|
||||
String outputDir = "crawled_content";
|
||||
|
||||
try {
|
||||
Crawler crawler = new Crawler(rootUrl, maxThreads, maxDepth, outputDir);
|
||||
crawler.start();
|
||||
} catch (MalformedURLException e) {
|
||||
System.err.println("Invalid root URL: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
week4_TinsaeGhilay/crawler/target/crawler-1.0-SNAPSHOT.jar
Normal file
BIN
week4_TinsaeGhilay/crawler/target/crawler-1.0-SNAPSHOT.jar
Normal file
Binary file not shown.
@@ -0,0 +1,3 @@
|
||||
artifactId=crawler
|
||||
groupId=com.tinsae.crawler
|
||||
version=1.0-SNAPSHOT
|
||||
@@ -0,0 +1,3 @@
|
||||
com/tinsae/crawler/Crawler.class
|
||||
com/tinsae/crawler/CrawlTask.class
|
||||
com/tinsae/crawler/Main.class
|
||||
@@ -0,0 +1,3 @@
|
||||
/home/tgk/Repos/Trusted/DistributedSystems/week4_TinsaeGhilay/crawler/src/main/java/com/tinsae/crawler/CrawlTask.java
|
||||
/home/tgk/Repos/Trusted/DistributedSystems/week4_TinsaeGhilay/crawler/src/main/java/com/tinsae/crawler/Crawler.java
|
||||
/home/tgk/Repos/Trusted/DistributedSystems/week4_TinsaeGhilay/crawler/src/main/java/com/tinsae/crawler/Main.java
|
||||
Reference in New Issue
Block a user