Add initial fetching code using Selenium

This commit is contained in:
Ben Kreeger 2025-08-19 18:39:07 -05:00
parent 92d7872439
commit fd25540633
Signed by: kreeger
GPG Key ID: D5CF8683D4BE4B50
11 changed files with 219 additions and 9 deletions

View File

@ -0,0 +1,27 @@
{
"name": "Putty",
"dockerComposeFile": "docker-compose.yaml",
"features": {
"ghcr.io/devcontainers/features/common-utils:2": {
"installZsh": "true",
"username": "user",
"upgradePackages": "false"
},
"ghcr.io/devcontainers/features/git:1": {
"version": "os-provided",
"ppa": "false"
}
},
"runArgs": [
"--cap-add=SYS_PTRACE",
"--security-opt",
"seccomp=unconfined"
],
"service": "workspace",
"workspaceFolder": "/workspace",
"forwardPorts": [
8080
],
"postCreateCommand": "sh ./.devcontainer/postCreateCommand.sh",
"remoteUser": "user"
}

View File

@ -0,0 +1,14 @@
---
# Dev-container services: a Swift toolchain container plus the Selenium browser it talks to.
services:
workspace:
# Swift 6.1 image; `sleep infinity` keeps the container running so it can be attached to.
image: swift:6.1
command: sleep infinity
depends_on: [browser]
# The repository is mounted at /workspace; the named volume build-tmp is mounted over /workspace/.build/.
volumes: [..:/workspace:cached, build-tmp:/workspace/.build/]
env_file: ../.env
browser:
# Selenium standalone Chromium; port 4444 is the one the CLI's WebDriver URL (http://browser:4444) targets.
image: selenium/standalone-chromium:latest
shm_size: 2gb
ports: [4444:4444, 7900:7900]
volumes:
build-tmp: {}

View File

@ -0,0 +1,5 @@
#!/usr/bin/env sh
# postCreateCommand.sh
#
# Runs once after the dev container is created.
#
# /workspace/.build is backed by the named volume `build-tmp`, while the
# container's remote user is `user`. Hand ownership of the build directory to
# that user (presumably the volume is created root-owned -- confirm).
# `-R` is placed before the operands so the command does not depend on GNU
# option permutation.
sudo chown -R user:user /workspace/.build
# TODO: Install swiftformat

1
.dockerignore Normal file
View File

@ -0,0 +1 @@
.build

View File

@ -1 +1 @@
6.2
6.1

1
CLI.d Normal file

File diff suppressed because one or more lines are too long

BIN
CLI.dia Normal file

Binary file not shown.

View File

@ -1,4 +1,4 @@
// swift-tools-version: 6.2
// swift-tools-version: 6.1
// The swift-tools-version declares the minimum version of Swift required to build this package.
import PackageDescription
@ -6,14 +6,20 @@ import PackageDescription
let package = Package(
name: "Putty",
platforms: [.macOS(.v15)],
products: [
.library(name: "PuttyKit", targets: ["PuttyKit"]),
],
dependencies: [
.package(url: "https://github.com/apple/swift-argument-parser", from: "1.6.1"),
],
targets: [
// Targets are the basic building blocks of a package, defining a module or a test suite.
// Targets can depend on other targets in this package and products from dependencies.
.executableTarget(name: "putty", dependencies: [
.product(name: "ArgumentParser", package: "swift-argument-parser"),
]),
],
)
.target(name: "PuttyKit"),
.executableTarget(name: "putty",
dependencies: [
.product(name: "ArgumentParser", package: "swift-argument-parser"),
.target(name: "PuttyKit"),
],
path: "Sources/CLI"),
])

View File

@ -1,12 +1,12 @@
import ArgumentParser
import PuttyKit
@main
struct CLI: AsyncParsableCommand {
static let configuration: CommandConfiguration = .init(
commandName: "putty",
abstract: "A utility for getting comic data from GoComics.com.",
subcommands: [Scrape.self],
)
subcommands: [Scrape.self])
}
struct Scrape: AsyncParsableCommand {
@ -14,5 +14,8 @@ struct Scrape: AsyncParsableCommand {
mutating func run() async throws {
print("scrape")
let fetcher = PageFetcher(webDriverURL: "http://browser:4444")
_ = try await fetcher.fetchAToZ()
print("Fetch complete!")
}
}

View File

@ -0,0 +1,125 @@
import Foundation
#if canImport(FoundationNetworking)
import FoundationNetworking
#endif
/// A client for a Selenium WebDriver server (running Chromium) that fetches the HTML source of a page.
///
/// Each call to `fetchHTML(from:)` opens its own WebDriver session, loads the page, reads the
/// page source and deletes the session again.
public final class PageFetcher {
    /// Base URL of the WebDriver server; every request path is appended to it.
    private let webDriverURL: URL
    private let session = URLSession.shared

    // MARK: - Initialization

    /// Initialize PageFetcher with WebDriver URL.
    ///
    /// - Parameter webDriverURL: The URL where Selenium WebDriver is running
    ///   (default: http://localhost:4444).
    /// - Precondition: `webDriverURL` must be parseable by `URL(string:)`; the initializer traps
    ///   with a descriptive message otherwise.
    public init(webDriverURL: String = "http://localhost:4444") {
        guard let parsedURL = URL(string: webDriverURL) else {
            preconditionFailure("PageFetcher: invalid WebDriver URL: \(webDriverURL)")
        }
        self.webDriverURL = parsedURL
    }

    // MARK: - Public Methods

    /// Fetch the raw HTML content from the specified URL.
    ///
    /// The WebDriver session created for the fetch is deleted on both the success and the
    /// failure path, so a failed navigation does not leave a browser session behind.
    ///
    /// - Parameter url: The URL to fetch content from
    /// - Returns: The raw HTML content as a string
    /// - Throws: PageFetcherError for various failure scenarios
    public func fetchHTML(from url: String) async throws -> String {
        let sessionId = try await startSession()
        let source: String
        do {
            try await navigateToURL(url, sessionId: sessionId)
            source = try await getPageSource(sessionId: sessionId)
        } catch {
            // Best-effort cleanup: the original failure is the one worth reporting, so any
            // error from deleting the session is deliberately discarded here.
            _ = try? await endSession(sessionId: sessionId)
            throw error
        }
        try await endSession(sessionId: sessionId)
        return source
    }

    // MARK: - Private Methods

    /// Start a new WebDriver session with Chrome capabilities.
    ///
    /// - Returns: The `sessionId` found under `value` in the server's JSON response.
    /// - Throws: `PageFetcherError.invalidSessionResponse` if the response lacks that field.
    private func startSession() async throws -> String {
        // Explicit annotation: the nested literal mixes string and dictionary values.
        let capabilities: [String: Any] = [
            "capabilities": [
                "alwaysMatch": [
                    "browserName": "chrome",
                    "goog:chromeOptions": [
                        "args": [
                            "--headless",
                            "--no-sandbox",
                            "--disable-dev-shm-usage",
                            "--disable-gpu",
                            "--window-size=1920,1080",
                        ],
                    ],
                ],
            ],
        ]
        let (data, _) = try await makeRequest(verb: "POST", path: "wd/hub/session", body: capabilities)
        guard let json = try JSONSerialization.jsonObject(with: data) as? [String: Any],
              let value = json["value"] as? [String: Any],
              let sessionId = value["sessionId"] as? String
        else {
            throw PageFetcherError.invalidSessionResponse
        }
        return sessionId
    }

    /// Ask the browser in the given session to load `url`.
    private func navigateToURL(_ url: String, sessionId: String) async throws {
        _ = try await makeRequest(verb: "POST", path: "url", sessionId: sessionId, body: ["url": url])
    }

    /// Read the current page source of the given session.
    ///
    /// - Throws: `PageFetcherError.invalidPageSourceResponse` if `value` is not a string.
    private func getPageSource(sessionId: String) async throws -> String {
        let (data, _) = try await makeRequest(verb: "GET", path: "source", sessionId: sessionId)
        guard let json = try JSONSerialization.jsonObject(with: data) as? [String: Any],
              let value = json["value"] as? String
        else {
            throw PageFetcherError.invalidPageSourceResponse
        }
        return value
    }

    /// Delete the given WebDriver session.
    private func endSession(sessionId: String) async throws {
        _ = try await makeRequest(verb: "DELETE", path: "", sessionId: sessionId)
    }

    /// Send one request to the WebDriver server and return the body of a 200 response.
    ///
    /// - Parameters:
    ///   - verb: HTTP method. Only `"POST"` requests carry a JSON body.
    ///   - path: Path appended to the WebDriver base URL.
    ///   - body: JSON object to send with a POST; an empty object is sent when this is nil.
    /// - Throws: `PageFetcherError.requestFailed` when the response is not an HTTP response,
    ///   `PageFetcherError.seleniumError` for any status other than 200, or the underlying
    ///   transport / serialization error.
    private func makeRequest(verb: String, path: String,
                             body: [String: Any]? = nil) async throws -> (Data, HTTPURLResponse)
    {
        var request = URLRequest(url: webDriverURL.appendingPathComponent(path))
        print("\(verb) \(request.url?.absoluteString ?? "")")
        request.httpMethod = verb
        if verb == "POST" {
            request.setValue("application/json", forHTTPHeaderField: "Content-Type")
            // A nil body is not a valid JSON top-level object; fall back to `{}`.
            request.httpBody = try JSONSerialization.data(withJSONObject: body ?? [:])
        }

        let (data, response) = try await session.data(for: request)
        guard let httpResponse = response as? HTTPURLResponse else {
            throw PageFetcherError.requestFailed
        }
        guard httpResponse.statusCode == 200 else {
            // Error payloads carry a human-readable message under value.message when present.
            let maybeJSON = try? JSONSerialization.jsonObject(with: data) as?
                [String: Any]
            let value = maybeJSON?["value"] as? [String: Any]
            let message = value?["message"] as? String ?? ""
            throw PageFetcherError.seleniumError(httpResponse.statusCode, message)
        }
        return (data, httpResponse)
    }

    /// Session-scoped variant: prefixes `path` with `wd/hub/session/<sessionId>`.
    ///
    /// An empty `path` addresses the session resource itself (no trailing slash is produced).
    private func makeRequest(verb: String, path: String, sessionId: String,
                             body: [String: Any]? = nil) async throws -> (Data, HTTPURLResponse)
    {
        let addendum = [sessionId, path].filter { !$0.isEmpty }.joined(separator: "/")
        return try await makeRequest(verb: verb, path: "wd/hub/session/\(addendum)", body: body)
    }
}
// MARK: - Convenience
public extension PageFetcher {
    /// Fetches the HTML source of the GoComics A-to-Z comics listing page.
    ///
    /// - Returns: The page source returned by `fetchHTML(from:)`.
    /// - Throws: Any error thrown by `fetchHTML(from:)`.
    func fetchAToZ() async throws -> String {
        let listingURL = "https://www.gocomics.com/comics/a-to-z"
        return try await fetchHTML(from: listingURL)
    }
}

View File

@ -0,0 +1,28 @@
import Foundation
/// The failures that `PageFetcher` can report while talking to a WebDriver server.
public enum PageFetcherError: Error, LocalizedError {
    case sessionCreationFailed
    case invalidSessionResponse
    case noActiveSession
    /// A non-success HTTP response: the status code and the server-supplied message.
    case seleniumError(Int, String)
    case requestFailed
    case invalidPageSourceResponse

    /// A human-readable description for each case.
    public var errorDescription: String? {
        switch self {
        case .sessionCreationFailed:
            return "Failed to create WebDriver session"
        case .invalidSessionResponse:
            return "Invalid response when creating WebDriver session"
        case .noActiveSession:
            return "No active WebDriver session"
        case .seleniumError(let statusCode, let detail):
            return "Selenium error (HTTP \(statusCode)): \(detail)"
        case .requestFailed:
            return "Failed to make request"
        case .invalidPageSourceResponse:
            return "Invalid response when getting page source"
        }
    }
}