started ex_4
This commit is contained in:
9
.vscode/launch.json
vendored
9
.vscode/launch.json
vendored
@@ -53,6 +53,15 @@
|
|||||||
"preLaunchTask": "build-task5",
|
"preLaunchTask": "build-task5",
|
||||||
"console": "integratedTerminal"
|
"console": "integratedTerminal"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"type": "java",
|
||||||
|
"name": "Run Crawler (Week 4)",
|
||||||
|
"request": "launch",
|
||||||
|
"cwd": "${workspaceFolder}/week4_TinsaeGhilay/crawler",
|
||||||
|
"projectName": "crawler",
|
||||||
|
"mainClass": "com.tinsae.crawler.Main",
|
||||||
|
"console": "integratedTerminal"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"type": "java",
|
"type": "java",
|
||||||
"name": "Run HelloWorldServer (Task 7/example)",
|
"name": "Run HelloWorldServer (Task 7/example)",
|
||||||
|
|||||||
BIN
week3_TinsaeGhilay.zip
Normal file
BIN
week3_TinsaeGhilay.zip
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
38
week4_TinsaeGhilay/crawler/pom.xml
Normal file
38
week4_TinsaeGhilay/crawler/pom.xml
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the week-4 multithreaded crawler exercise. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.tinsae.crawler</groupId>
    <artifactId>crawler</artifactId>
    <version>1.0-SNAPSHOT</version>

    <!-- Compile source and bytecode for Java 17. -->
    <properties>
        <maven.compiler.source>17</maven.compiler.source>
        <maven.compiler.target>17</maven.compiler.target>
    </properties>

    <dependencies>
        <!-- JSoup for HTML parsing -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.15.3</version>
        </dependency>
    </dependencies>

    <!-- `mvn exec:java` launches the crawler entry point. -->
    <build>
        <plugins>
            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>exec-maven-plugin</artifactId>
                <version>3.1.0</version>
                <configuration>
                    <mainClass>com.tinsae.crawler.Main</mainClass>
                </configuration>
            </plugin>
        </plugins>
    </build>

</project>
|
||||||
@@ -0,0 +1,160 @@
|
|||||||
|
package com.tinsae.crawler;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.BufferedWriter;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.HashSet;
import java.util.Set;

/**
 * CrawlTask represents a single URL crawl operation.
 *
 * <p>Runs on an executor thread: fetches the page, extracts same-domain
 * links, appends the discovered URLs to a shared output file, and submits
 * a new CrawlTask (at depth + 1) for each discovered link.
 */
public class CrawlTask implements Runnable {

    private final String url;
    private final int depth;
    private final Crawler crawler;

    // One shared output file per JVM run, named once at class-load time.
    private static final String URLS_FILENAME =
            "crawled_urls_" + System.currentTimeMillis() + ".txt";

    // The output file is shared by ALL task instances, so writes must be
    // guarded by a static lock. (The previous instance-level `synchronized`
    // method locked per-task and did not prevent interleaved writes from
    // concurrent tasks.)
    private static final Object FILE_LOCK = new Object();

    /**
     * @param url     absolute URL to fetch
     * @param depth   crawl depth of this page (root = 0)
     * @param crawler owning crawler, consulted for limits and task submission
     */
    public CrawlTask(String url, int depth, Crawler crawler) {
        this.url = url;
        this.depth = depth;
        this.crawler = crawler;
    }

    /** URL this task will fetch. */
    public String getUrl() {
        return url;
    }

    /** Crawl depth of this task (root = 0). */
    public int getDepth() {
        return depth;
    }

    /**
     * Fetch the page, extract links (unless at max depth), persist them,
     * and submit follow-up tasks. All failures are logged, never rethrown,
     * so one bad page cannot kill an executor thread's task.
     */
    @Override
    public void run() {
        System.out.println("[" + Thread.currentThread().getName() + "] Crawling: " + url + " (depth: " + depth + ")");

        try {
            // Fetch and parse the page
            Document document = fetchPage(url);
            if (document == null) {
                System.err.println("[" + Thread.currentThread().getName() + "] Failed to fetch: " + url);
                return;
            }

            // Extract links if we haven't reached max depth
            if (depth < crawler.getMaxDepth()) {
                Set<String> links = extractLinks(document, url);
                System.out.println(
                        "[" + Thread.currentThread().getName() + "] Found " + links.size() + " links on " + url);

                // Save URLs to file
                if (!links.isEmpty()) {
                    saveUrlsToFile(links, url, depth);
                }

                // Submit tasks for each discovered link
                for (String link : links) {
                    CrawlTask newTask = new CrawlTask(link, depth + 1, crawler);
                    crawler.submitTask(newTask);
                }
            } else {
                System.out.println("[" + Thread.currentThread().getName() + "] Max depth reached for: " + url);
            }

        } catch (Exception e) {
            System.err.println(
                    "[" + Thread.currentThread().getName() + "] Error crawling " + url + ": " + e.getMessage());
        }
    }

    /**
     * Fetch a web page and return it as a JSoup Document.
     *
     * @throws IOException on connect/read failure or non-OK HTTP status
     */
    private Document fetchPage(String url) throws IOException {
        // Add timeout and user agent for responsible crawling
        return Jsoup.connect(url)
                .timeout(10000) // 10 second timeout
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
                .get();
    }

    /**
     * Extract all anchor links on the page that point to the same host as
     * {@code pageUrl}.
     *
     * @throws URISyntaxException if {@code pageUrl} itself is not a valid URI
     */
    private Set<String> extractLinks(Document document, String pageUrl) throws URISyntaxException {
        Set<String> links = new HashSet<>();
        URI baseUri = new URI(pageUrl);
        String baseDomain = baseUri.getHost();

        // Walk the parsed DOM instead of regex-scanning raw HTML (the previous
        // approach): absUrl("href") resolves relative links against the page's
        // base URI, and URLs buried in scripts or plain text are no longer
        // picked up by accident.
        for (var anchor : document.select("a[href]")) {
            String href = anchor.absUrl("href");

            if (href.isEmpty() || !href.startsWith("http")) {
                continue;
            }

            try {
                URI linkUri = new URI(href);
                String linkHost = linkUri.getHost();

                // Only crawl links from the same domain
                if (linkHost != null && linkHost.equals(baseDomain)) {
                    links.add(href);
                }
            } catch (URISyntaxException e) {
                // Skip invalid URLs; one bad href must not abort the page.
            }
        }

        return links;
    }

    /**
     * Append the discovered URLs to the shared output file, tagged with the
     * page they were found on and the crawl depth. Guarded by a static lock
     * because the file is shared across all task instances.
     */
    private void saveUrlsToFile(Set<String> urls, String sourceUrl, int depth) {
        synchronized (FILE_LOCK) {
            try {
                String filepath = crawler.getOutputDir() + "/" + URLS_FILENAME;

                // Append URLs to file (create if doesn't exist)
                try (BufferedWriter writer = Files.newBufferedWriter(Paths.get(filepath),
                        StandardOpenOption.CREATE, StandardOpenOption.APPEND)) {

                    writer.write("=== Found on: " + sourceUrl + " (depth: " + depth + ") ===\n");
                    for (String url : urls) {
                        writer.write(url + "\n");
                    }
                    writer.write("\n");
                }

                System.out.println(
                        "[" + Thread.currentThread().getName() + "] Appended " + urls.size() + " URLs to: " + URLS_FILENAME);
            } catch (IOException e) {
                System.err.println("[" + Thread.currentThread().getName() + "] Failed to save URLs: " + e.getMessage());
            }
        }
    }
}
|
||||||
@@ -0,0 +1,153 @@
|
|||||||
|
package com.tinsae.crawler;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Multithreaded Web Crawler
 * - Starts from a root URL
 * - Runs crawl tasks on a fixed thread pool
 * - Stores discovered URLs locally (see CrawlTask)
 * - Respects max task-count and max depth limits
 *
 * @author Tinsae Ghilay
 */
public class Crawler {

    private final String rootUrl;
    // NOTE: despite the name, this bounds the TOTAL number of tasks submitted
    // over the whole crawl, and also sizes the fixed thread pool.
    private final int maxThreads;
    private final int maxDepth;
    private final String outputDir;

    // Thread-safe shared state
    private final Set<String> visitedUrls = Collections.synchronizedSet(new HashSet<>());
    private final ExecutorService executorService;
    private final AtomicInteger activeThreads = new AtomicInteger(0);       // tasks currently executing
    private final AtomicInteger totalThreadsStarted = new AtomicInteger(0); // tasks ever submitted

    /**
     * @param rootUrl    starting URL for the crawl
     * @param maxThreads pool size AND upper bound on total submitted tasks
     * @param maxDepth   maximum link depth from the root (root = 0)
     * @param outputDir  directory for crawl output files (created if absent)
     */
    public Crawler(String rootUrl, int maxThreads, int maxDepth, String outputDir) {
        this.rootUrl = rootUrl;
        this.maxThreads = maxThreads;
        this.maxDepth = maxDepth;
        this.outputDir = outputDir;
        this.executorService = Executors.newFixedThreadPool(maxThreads);

        // Create output directory if it doesn't exist
        try {
            Files.createDirectories(Paths.get(outputDir));
            System.out.println("Output directory created: " + outputDir);
        } catch (IOException e) {
            // Best-effort: the crawl proceeds, but file writes will likely fail.
            System.err.println("Failed to create output directory: " + e.getMessage());
        }
    }

    /**
     * Start crawling from the root URL and block until the crawl finishes
     * (or the 5-minute shutdown timeout elapses).
     *
     * @throws MalformedURLException if the root URL is not a valid URL
     */
    public void start() throws MalformedURLException {
        // Fail fast on an invalid root URL. (Previously this exception was
        // declared but nothing in the method could actually throw it.)
        new URL(rootUrl);

        System.out.println("Starting crawler...");
        System.out.println("Root URL: " + rootUrl);
        System.out.println("Max Threads: " + maxThreads);
        System.out.println("Max Depth: " + maxDepth);
        System.out.println("Output Directory: " + outputDir);
        System.out.println("-------------------------------------------");

        // Submit the first task
        CrawlTask rootTask = new CrawlTask(rootUrl, 0, this);
        submitTask(rootTask);

        // Poll until all in-flight tasks have finished. The `== 0` clause
        // covers the brief window before the root task has been counted.
        while (activeThreads.get() > 0 || totalThreadsStarted.get() == 0) {
            try {
                Thread.sleep(500);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt(); // preserve interrupt status
                break;
            }
        }

        // Now shutdown the executor
        executorService.shutdown();
        try {
            // Wait up to 5 minutes for completion
            if (!executorService.awaitTermination(5, TimeUnit.MINUTES)) {
                System.out.println("\nTimeout: Stopping crawler...");
                executorService.shutdownNow();
            }
        } catch (InterruptedException e) {
            System.err.println("Crawler interrupted: " + e.getMessage());
            executorService.shutdownNow();
            Thread.currentThread().interrupt();
        }

        System.out.println("\n-------------------------------------------");
        System.out.println("Crawling complete!");
        System.out.println("Total URLs visited: " + visitedUrls.size());
        System.out.println("Total threads started: " + totalThreadsStarted.get());
        System.out.println("Max depth reached: " + maxDepth);
    }

    /**
     * Submit a crawl task unless its URL was already visited, it exceeds the
     * depth limit, or the total-task budget is spent. Synchronized so the
     * visited-check and mark-visited happen atomically across callers.
     */
    synchronized void submitTask(CrawlTask task) {
        // Check if URL was already visited
        if (visitedUrls.contains(task.getUrl())) {
            return;
        }

        // Check if max depth exceeded
        if (task.getDepth() > maxDepth) {
            return;
        }

        // Check if max threads reached (but allow some overflow for queued tasks)
        if (totalThreadsStarted.get() >= maxThreads) {
            System.out.println("[Crawler] Max threads reached (" + maxThreads + "). Stopping new crawls.");
            return;
        }

        // Mark as visited and submit
        visitedUrls.add(task.getUrl());
        totalThreadsStarted.incrementAndGet();
        activeThreads.incrementAndGet();

        executorService.submit(() -> {
            try {
                task.run();
            } finally {
                // Always decrement, even if the task throws.
                activeThreads.decrementAndGet();
            }
        });

        System.out.println("[Crawler] Submitted task #" + totalThreadsStarted.get() + " for: " + task.getUrl()
                + " (depth: " + task.getDepth() + ")");
    }

    /** Get the set of visited URLs (thread-safe live view, not a copy). */
    Set<String> getVisitedUrls() {
        return visitedUrls;
    }

    /** Get the output directory. */
    String getOutputDir() {
        return outputDir;
    }

    /** Get max depth limit. */
    int getMaxDepth() {
        return maxDepth;
    }
}
|
||||||
@@ -0,0 +1,22 @@
|
|||||||
|
package com.tinsae.crawler;

import java.net.MalformedURLException;

/**
 * Entry point for the week-4 crawler exercise: configures the crawl
 * (root URL, task/thread limit, depth limit, output directory) and runs
 * it to completion.
 */
public class Main {

    public static void main(String[] args) {
        // Configuration.
        // NOTE: the previous version assigned "https://crawler-test.com/" and
        // then immediately overwrote it; the dead store is removed here — the
        // effective root URL is unchanged.
        String rootUrl = "https://goodnews.eu/";
        // Also sizes the fixed thread pool; 1500 OS threads is very heavy for
        // a single-host crawl — consider lowering.
        int maxThreads = 1500;
        int maxDepth = 5;
        String outputDir = "crawled_content";

        try {
            Crawler crawler = new Crawler(rootUrl, maxThreads, maxDepth, outputDir);
            crawler.start();
        } catch (MalformedURLException e) {
            System.err.println("Invalid root URL: " + e.getMessage());
        }
    }
}
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
week4_TinsaeGhilay/crawler/target/crawler-1.0-SNAPSHOT.jar
Normal file
BIN
week4_TinsaeGhilay/crawler/target/crawler-1.0-SNAPSHOT.jar
Normal file
Binary file not shown.
@@ -0,0 +1,3 @@
|
|||||||
|
artifactId=crawler
|
||||||
|
groupId=com.tinsae.crawler
|
||||||
|
version=1.0-SNAPSHOT
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
com/tinsae/crawler/Crawler.class
|
||||||
|
com/tinsae/crawler/CrawlTask.class
|
||||||
|
com/tinsae/crawler/Main.class
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
/home/tgk/Repos/Trusted/DistributedSystems/week4_TinsaeGhilay/crawler/src/main/java/com/tinsae/crawler/CrawlTask.java
|
||||||
|
/home/tgk/Repos/Trusted/DistributedSystems/week4_TinsaeGhilay/crawler/src/main/java/com/tinsae/crawler/Crawler.java
|
||||||
|
/home/tgk/Repos/Trusted/DistributedSystems/week4_TinsaeGhilay/crawler/src/main/java/com/tinsae/crawler/Main.java
|
||||||
Reference in New Issue
Block a user