Add initial fetching code using Selenium
This commit is contained in:
parent
92d7872439
commit
fd25540633
27
.devcontainer/devcontainer.json
Normal file
27
.devcontainer/devcontainer.json
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
{
|
||||||
|
"name": "Putty",
|
||||||
|
"dockerComposeFile": "docker-compose.yaml",
|
||||||
|
"features": {
|
||||||
|
"ghcr.io/devcontainers/features/common-utils:2": {
|
||||||
|
"installZsh": "true",
|
||||||
|
"username": "user",
|
||||||
|
"upgradePackages": "false"
|
||||||
|
},
|
||||||
|
"ghcr.io/devcontainers/features/git:1": {
|
||||||
|
"version": "os-provided",
|
||||||
|
"ppa": "false"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"runArgs": [
|
||||||
|
"--cap-add=SYS_PTRACE",
|
||||||
|
"--security-opt",
|
||||||
|
"seccomp=unconfined"
|
||||||
|
],
|
||||||
|
"service": "workspace",
|
||||||
|
"workspaceFolder": "/workspace",
|
||||||
|
"forwardPorts": [
|
||||||
|
8080
|
||||||
|
],
|
||||||
|
"postCreateCommand": "sh ./.devcontainer/postCreateCommand.sh",
|
||||||
|
"remoteUser": "user"
|
||||||
|
}
|
||||||
14
.devcontainer/docker-compose.yaml
Normal file
14
.devcontainer/docker-compose.yaml
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
---
|
||||||
|
services:
|
||||||
|
workspace:
|
||||||
|
image: swift:6.1
|
||||||
|
command: sleep infinity
|
||||||
|
depends_on: [browser]
|
||||||
|
volumes: [..:/workspace:cached, build-tmp:/workspace/.build/]
|
||||||
|
env_file: ../.env
|
||||||
|
browser:
|
||||||
|
image: selenium/standalone-chromium:latest
|
||||||
|
shm_size: 2gb
|
||||||
|
ports: [4444:4444, 7900:7900]
|
||||||
|
volumes:
|
||||||
|
build-tmp: {}
|
||||||
5
.devcontainer/postCreateCommand.sh
Normal file
5
.devcontainer/postCreateCommand.sh
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
#!/usr/bin/env sh
|
||||||
|
|
||||||
|
# postCreateCommand.sh
|
||||||
|
sudo chown user:user -R /workspace/.build
|
||||||
|
# TODO: Install swiftformat
|
||||||
1
.dockerignore
Normal file
1
.dockerignore
Normal file
@ -0,0 +1 @@
|
|||||||
|
.build
|
||||||
@ -1 +1 @@
|
|||||||
6.2
|
6.1
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
// swift-tools-version: 6.2
|
// swift-tools-version: 6.1
|
||||||
// The swift-tools-version declares the minimum version of Swift required to build this package.
|
// The swift-tools-version declares the minimum version of Swift required to build this package.
|
||||||
|
|
||||||
import PackageDescription
|
import PackageDescription
|
||||||
@ -6,14 +6,20 @@ import PackageDescription
|
|||||||
let package = Package(
|
let package = Package(
|
||||||
name: "Putty",
|
name: "Putty",
|
||||||
platforms: [.macOS(.v15)],
|
platforms: [.macOS(.v15)],
|
||||||
|
products: [
|
||||||
|
.library(name: "PuttyKit", targets: ["PuttyKit"]),
|
||||||
|
],
|
||||||
dependencies: [
|
dependencies: [
|
||||||
.package(url: "https://github.com/apple/swift-argument-parser", from: "1.6.1"),
|
.package(url: "https://github.com/apple/swift-argument-parser", from: "1.6.1"),
|
||||||
],
|
],
|
||||||
targets: [
|
targets: [
|
||||||
// Targets are the basic building blocks of a package, defining a module or a test suite.
|
// Targets are the basic building blocks of a package, defining a module or a test suite.
|
||||||
// Targets can depend on other targets in this package and products from dependencies.
|
// Targets can depend on other targets in this package and products from dependencies.
|
||||||
.executableTarget(name: "putty", dependencies: [
|
.target(name: "PuttyKit"),
|
||||||
.product(name: "ArgumentParser", package: "swift-argument-parser"),
|
.executableTarget(name: "putty",
|
||||||
]),
|
dependencies: [
|
||||||
],
|
.product(name: "ArgumentParser", package: "swift-argument-parser"),
|
||||||
)
|
.target(name: "PuttyKit"),
|
||||||
|
],
|
||||||
|
path: "Sources/CLI"),
|
||||||
|
])
|
||||||
|
|||||||
@ -1,12 +1,12 @@
|
|||||||
import ArgumentParser
|
import ArgumentParser
|
||||||
|
import PuttyKit
|
||||||
|
|
||||||
@main
|
@main
|
||||||
struct CLI: AsyncParsableCommand {
|
struct CLI: AsyncParsableCommand {
|
||||||
static let configuration: CommandConfiguration = .init(
|
static let configuration: CommandConfiguration = .init(
|
||||||
commandName: "putty",
|
commandName: "putty",
|
||||||
abstract: "A utility for getting comic data from GoComics.com.",
|
abstract: "A utility for getting comic data from GoComics.com.",
|
||||||
subcommands: [Scrape.self],
|
subcommands: [Scrape.self])
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
struct Scrape: AsyncParsableCommand {
|
struct Scrape: AsyncParsableCommand {
|
||||||
@ -14,5 +14,8 @@ struct Scrape: AsyncParsableCommand {
|
|||||||
|
|
||||||
mutating func run() async throws {
|
mutating func run() async throws {
|
||||||
print("scrape")
|
print("scrape")
|
||||||
|
let fetcher = PageFetcher(webDriverURL: "http://browser:4444")
|
||||||
|
_ = try await fetcher.fetchAToZ()
|
||||||
|
print("Fetch complete!")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
125
Sources/PuttyKit/Fetchers/PageFetcher.swift
Normal file
125
Sources/PuttyKit/Fetchers/PageFetcher.swift
Normal file
@ -0,0 +1,125 @@
|
|||||||
|
import Foundation
|
||||||
|
#if canImport(FoundationNetworking)
|
||||||
|
import FoundationNetworking
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/// A client for a Selenium WebDriver server running Chromium that fetches the HTML source of web pages.
///
/// Each call to `fetchHTML(from:)` opens its own WebDriver session and ends it again,
/// including when navigation or page-source retrieval fails.
public final class PageFetcher {
    /// Command-line switches passed to the browser when a session is created.
    private static let chromeArguments: [String] = [
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
    ]

    /// Base URL of the WebDriver server; request paths are appended to it.
    private let webDriverURL: URL
    private let session = URLSession.shared

    // MARK: - Initialization

    /// Initialize PageFetcher with WebDriver URL
    ///
    /// - Parameter webDriverURL: The URL where Selenium WebDriver is running, for example
    ///   `http://localhost:4444`. There is no default; the caller must always supply it.
    /// - Precondition: `webDriverURL` must be parseable as a URL. An unparseable string is treated
    ///   as a programmer error and stops the process with a descriptive message.
    public init(webDriverURL: String) {
        guard let parsedURL = URL(string: webDriverURL) else {
            preconditionFailure("Invalid WebDriver URL: \(webDriverURL)")
        }
        self.webDriverURL = parsedURL
    }

    // MARK: - Public Methods

    /// Fetch the raw HTML content from the specified URL.
    ///
    /// Starts a WebDriver session, navigates to `url`, reads the page source, and ends the session.
    ///
    /// - Parameter url: The URL to fetch content from
    /// - Returns: The raw HTML content as a string
    /// - Throws: `PageFetcherError` for WebDriver-level failures, or the underlying error thrown by
    ///   `URLSession` / `JSONSerialization`.
    public func fetchHTML(from url: String) async throws -> String {
        let sessionId = try await startSession()

        let source: String
        do {
            try await navigateToURL(url, sessionId: sessionId)
            source = try await getPageSource(sessionId: sessionId)
        } catch {
            // `defer` cannot await, so clean up by hand. The cleanup is best-effort so that the
            // original failure is the one reported to the caller, and the session is not leaked
            // on the WebDriver server.
            _ = try? await endSession(sessionId: sessionId)
            throw error
        }

        try await endSession(sessionId: sessionId)
        return source
    }

    // MARK: - Private Methods

    /// Start a new WebDriver session with Chrome capabilities.
    ///
    /// - Returns: The session identifier found under `value.sessionId` in the response.
    /// - Throws: `PageFetcherError.invalidSessionResponse` when the response does not have that shape.
    private func startSession() async throws -> String {
        // Built step by step with explicit types so that no heterogeneous literal has to be inferred.
        let chromeOptions: [String: Any] = ["args": Self.chromeArguments]
        let alwaysMatch: [String: Any] = [
            "browserName": "chrome",
            "goog:chromeOptions": chromeOptions,
        ]
        let capabilities: [String: Any] = [
            "capabilities": ["alwaysMatch": alwaysMatch],
        ]

        let (data, _) = try await makeRequest(verb: "POST", path: "wd/hub/session", body: capabilities)
        guard let json = try JSONSerialization.jsonObject(with: data) as? [String: Any],
              let value = json["value"] as? [String: Any],
              let sessionId = value["sessionId"] as? String
        else {
            throw PageFetcherError.invalidSessionResponse
        }
        return sessionId
    }

    /// Point the browser of the given session at `url`.
    private func navigateToURL(_ url: String, sessionId: String) async throws {
        _ = try await makeRequest(verb: "POST", path: "url", sessionId: sessionId, body: ["url": url])
    }

    /// Read the source of the page currently loaded in the given session.
    ///
    /// - Returns: The string found under `value` in the response.
    /// - Throws: `PageFetcherError.invalidPageSourceResponse` when `value` is missing or not a string.
    private func getPageSource(sessionId: String) async throws -> String {
        let (data, _) = try await makeRequest(verb: "GET", path: "source", sessionId: sessionId)
        guard let json = try JSONSerialization.jsonObject(with: data) as? [String: Any],
              let value = json["value"] as? String
        else {
            throw PageFetcherError.invalidPageSourceResponse
        }

        return value
    }

    /// End the given WebDriver session.
    private func endSession(sessionId: String) async throws {
        _ = try await makeRequest(verb: "DELETE", path: "", sessionId: sessionId)
    }

    /// Send one request to the WebDriver server and return its body and HTTP response.
    ///
    /// - Parameters:
    ///   - verb: The HTTP method. Only `"POST"` requests carry a JSON body.
    ///   - path: Path appended to the WebDriver base URL.
    ///   - body: JSON object to send with a POST; an empty object is sent when this is `nil`.
    /// - Throws: `PageFetcherError.requestFailed` when the response is not an HTTP response, and
    ///   `PageFetcherError.seleniumError` (status code plus the server's `value.message`, if any)
    ///   for any status other than 200.
    private func makeRequest(verb: String, path: String,
                             body: [String: Any]? = nil) async throws -> (Data, HTTPURLResponse)
    {
        var request = URLRequest(url: webDriverURL.appendingPathComponent(path))
        print("\(verb) \(request.url?.absoluteString ?? "")")
        request.httpMethod = verb
        if verb == "POST" {
            request.setValue("application/json", forHTTPHeaderField: "Content-Type")
            // A nil body wrapped in `Any` is not a valid top-level JSON object for
            // JSONSerialization, so fall back to an empty JSON object.
            request.httpBody = try JSONSerialization.data(withJSONObject: body ?? [:])
        }

        let (data, response) = try await session.data(for: request)
        guard let httpResponse = response as? HTTPURLResponse else {
            // This path is shared by every request, not just session creation.
            throw PageFetcherError.requestFailed
        }
        guard httpResponse.statusCode == 200 else {
            let maybeJSON = try? JSONSerialization.jsonObject(with: data) as? [String: Any]
            let value = maybeJSON?["value"] as? [String: Any]
            let message = value?["message"] as? String ?? ""
            throw PageFetcherError.seleniumError(httpResponse.statusCode, message)
        }
        return (data, httpResponse)
    }

    /// Send a request scoped to a session: the path becomes `wd/hub/session/<sessionId>[/<path>]`.
    ///
    /// An empty `path` addresses the session itself, with no trailing slash.
    private func makeRequest(verb: String, path: String, sessionId: String,
                             body: [String: Any]? = nil) async throws -> (Data, HTTPURLResponse)
    {
        let addendum = [sessionId, path].filter { !$0.isEmpty }.joined(separator: "/")
        return try await makeRequest(verb: verb, path: "wd/hub/session/\(addendum)", body: body)
    }
}
|
||||||
|
|
||||||
|
// MARK: - Errors
|
||||||
|
|
||||||
|
// MARK: - Usage Example
|
||||||
|
|
||||||
|
public extension PageFetcher {
    /// Fetches the HTML source of the GoComics A-to-Z comics page.
    ///
    /// - Returns: The page source returned by `fetchHTML(from:)`.
    /// - Throws: Any error thrown by `fetchHTML(from:)`.
    func fetchAToZ() async throws -> String {
        let listingURL = "https://www.gocomics.com/comics/a-to-z"
        let html = try await fetchHTML(from: listingURL)
        return html
    }
}
|
||||||
28
Sources/PuttyKit/Fetchers/PageFetcherError.swift
Normal file
28
Sources/PuttyKit/Fetchers/PageFetcherError.swift
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
import Foundation
|
||||||
|
|
||||||
|
/// The failures that `PageFetcher` can report.
public enum PageFetcherError: Error, LocalizedError {
    /// A WebDriver session could not be created.
    case sessionCreationFailed
    /// The response to a session-creation request was not in the expected form.
    case invalidSessionResponse
    /// There is no active WebDriver session.
    case noActiveSession
    /// Selenium reported an error; carries the HTTP status code and the error message.
    case seleniumError(Int, String)
    /// A request could not be made.
    case requestFailed
    /// The response to a page-source request was not in the expected form.
    case invalidPageSourceResponse

    /// A human-readable description of the error.
    public var errorDescription: String? {
        let text: String
        switch self {
        case .sessionCreationFailed:
            text = "Failed to create WebDriver session"
        case .invalidSessionResponse:
            text = "Invalid response when creating WebDriver session"
        case .noActiveSession:
            text = "No active WebDriver session"
        case .seleniumError(let statusCode, let detail):
            text = "Selenium error (HTTP \(statusCode)): \(detail)"
        case .requestFailed:
            text = "Failed to make request"
        case .invalidPageSourceResponse:
            text = "Invalid response when getting page source"
        }
        return text
    }
}
|
||||||
Loading…
x
Reference in New Issue
Block a user