Add initial fetching code using Selenium
This commit is contained in:
parent
92d7872439
commit
fd25540633
27
.devcontainer/devcontainer.json
Normal file
27
.devcontainer/devcontainer.json
Normal file
@ -0,0 +1,27 @@
|
||||
{
|
||||
"name": "Putty",
|
||||
"dockerComposeFile": "docker-compose.yaml",
|
||||
"features": {
|
||||
"ghcr.io/devcontainers/features/common-utils:2": {
|
||||
"installZsh": "true",
|
||||
"username": "user",
|
||||
"upgradePackages": "false"
|
||||
},
|
||||
"ghcr.io/devcontainers/features/git:1": {
|
||||
"version": "os-provided",
|
||||
"ppa": "false"
|
||||
}
|
||||
},
|
||||
"runArgs": [
|
||||
"--cap-add=SYS_PTRACE",
|
||||
"--security-opt",
|
||||
"seccomp=unconfined"
|
||||
],
|
||||
"service": "workspace",
|
||||
"workspaceFolder": "/workspace",
|
||||
"forwardPorts": [
|
||||
8080
|
||||
],
|
||||
"postCreateCommand": "sh ./.devcontainer/postCreateCommand.sh",
|
||||
"remoteUser": "user"
|
||||
}
|
||||
14
.devcontainer/docker-compose.yaml
Normal file
14
.devcontainer/docker-compose.yaml
Normal file
@ -0,0 +1,14 @@
|
||||
---
|
||||
services:
|
||||
workspace:
|
||||
image: swift:6.1
|
||||
command: sleep infinity
|
||||
depends_on: [browser]
|
||||
volumes: [..:/workspace:cached, build-tmp:/workspace/.build/]
|
||||
env_file: ../.env
|
||||
browser:
|
||||
image: selenium/standalone-chromium:latest
|
||||
shm_size: 2gb
|
||||
ports: [4444:4444, 7900:7900]
|
||||
volumes:
|
||||
build-tmp: {}
|
||||
5
.devcontainer/postCreateCommand.sh
Normal file
5
.devcontainer/postCreateCommand.sh
Normal file
@ -0,0 +1,5 @@
|
||||
#!/usr/bin/env sh
|
||||
|
||||
# postCreateCommand.sh
|
||||
sudo chown user:user -R /workspace/.build
|
||||
# TODO: Install swiftformat
|
||||
1
.dockerignore
Normal file
1
.dockerignore
Normal file
@ -0,0 +1 @@
|
||||
.build
|
||||
@ -1 +1 @@
|
||||
6.2
|
||||
6.1
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// swift-tools-version: 6.2
|
||||
// swift-tools-version: 6.1
|
||||
// The swift-tools-version declares the minimum version of Swift required to build this package.
|
||||
|
||||
import PackageDescription
|
||||
@ -6,14 +6,20 @@ import PackageDescription
|
||||
let package = Package(
|
||||
name: "Putty",
|
||||
platforms: [.macOS(.v15)],
|
||||
products: [
|
||||
.library(name: "PuttyKit", targets: ["PuttyKit"]),
|
||||
],
|
||||
dependencies: [
|
||||
.package(url: "https://github.com/apple/swift-argument-parser", from: "1.6.1"),
|
||||
],
|
||||
targets: [
|
||||
// Targets are the basic building blocks of a package, defining a module or a test suite.
|
||||
// Targets can depend on other targets in this package and products from dependencies.
|
||||
.executableTarget(name: "putty", dependencies: [
|
||||
.product(name: "ArgumentParser", package: "swift-argument-parser"),
|
||||
]),
|
||||
],
|
||||
)
|
||||
.target(name: "PuttyKit"),
|
||||
.executableTarget(name: "putty",
|
||||
dependencies: [
|
||||
.product(name: "ArgumentParser", package: "swift-argument-parser"),
|
||||
.target(name: "PuttyKit"),
|
||||
],
|
||||
path: "Sources/CLI"),
|
||||
])
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
import ArgumentParser
|
||||
import PuttyKit
|
||||
|
||||
@main
|
||||
struct CLI: AsyncParsableCommand {
|
||||
static let configuration: CommandConfiguration = .init(
|
||||
commandName: "putty",
|
||||
abstract: "A utility for getting comic data from GoComics.com.",
|
||||
subcommands: [Scrape.self],
|
||||
)
|
||||
subcommands: [Scrape.self])
|
||||
}
|
||||
|
||||
struct Scrape: AsyncParsableCommand {
|
||||
@ -14,5 +14,8 @@ struct Scrape: AsyncParsableCommand {
|
||||
|
||||
/// Run the `scrape` subcommand: fetch the GoComics A-to-Z page through the WebDriver
/// reachable at `http://browser:4444` and report completion on stdout.
///
/// The fetched HTML is currently discarded.
mutating func run() async throws {
    print("scrape")
    let webDriverAddress = "http://browser:4444"
    let pageFetcher = PageFetcher(webDriverURL: webDriverAddress)
    _ = try await pageFetcher.fetchAToZ()
    print("Fetch complete!")
}
|
||||
}
|
||||
125
Sources/PuttyKit/Fetchers/PageFetcher.swift
Normal file
125
Sources/PuttyKit/Fetchers/PageFetcher.swift
Normal file
@ -0,0 +1,125 @@
|
||||
import Foundation
|
||||
#if canImport(FoundationNetworking)
|
||||
import FoundationNetworking
|
||||
#endif
|
||||
|
||||
/// A class that connects to a Selenium WebDriver running Chromium to fetch HTML page content
|
||||
public final class PageFetcher {
|
||||
private let webDriverURL: URL
|
||||
private let session = URLSession.shared
|
||||
|
||||
// MARK: - Initialization
|
||||
|
||||
/// Initialize PageFetcher with WebDriver URL
///
/// - Parameter webDriverURL: The URL where Selenium WebDriver is running (default: http://localhost:4444)
/// - Precondition: `webDriverURL` must be a string that `URL(string:)` can parse; an
///   unparseable value is treated as a programmer error and traps with a message
///   naming the offending string.
public init(webDriverURL: String = "http://localhost:4444") {
    guard let parsedURL = URL(string: webDriverURL) else {
        preconditionFailure("Invalid WebDriver URL: \(webDriverURL)")
    }
    self.webDriverURL = parsedURL
}
|
||||
|
||||
// MARK: - Public Methods
|
||||
|
||||
/// Fetch the raw HTML content from the specified URL.
///
/// A new WebDriver session is started for the call and is ended again before returning,
/// including when navigation or page-source retrieval fails, so a failed fetch does not
/// leave a session open on the WebDriver server.
///
/// - Parameter url: The URL to fetch content from
/// - Returns: The raw HTML content as a string
/// - Throws: `PageFetcherError`, or any other error raised by the underlying request helpers.
///   When the fetch itself fails, that original error is rethrown and a failure to end
///   the session is ignored.
public func fetchHTML(from url: String) async throws -> String {
    let sessionId = try await startSession()
    let source: String
    do {
        try await navigateToURL(url, sessionId: sessionId)
        source = try await getPageSource(sessionId: sessionId)
    } catch {
        // `defer` cannot await, so the session is torn down explicitly here.
        // The original failure is more useful than a secondary teardown error.
        try? await endSession(sessionId: sessionId)
        throw error
    }
    try await endSession(sessionId: sessionId)
    return source
}
|
||||
|
||||
// MARK: - Private Methods
|
||||
|
||||
/// Start a new WebDriver session with Chrome capabilities.
///
/// - Returns: The `sessionId` string found under `value` in the server's JSON response.
/// - Throws: `PageFetcherError.invalidSessionResponse` when the response does not have
///   that shape, or any error raised while making the request or decoding the JSON.
private func startSession() async throws -> String {
    let chromeArguments = [
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
    ]
    // The values are heterogeneous (a string and a nested dictionary), so the
    // dictionary types are spelled out explicitly.
    let alwaysMatch: [String: Any] = [
        "browserName": "chrome",
        "goog:chromeOptions": ["args": chromeArguments],
    ]
    let payload: [String: Any] = ["capabilities": ["alwaysMatch": alwaysMatch]]

    let (responseData, _) = try await makeRequest(verb: "POST", path: "wd/hub/session", body: payload)
    let decoded = try JSONSerialization.jsonObject(with: responseData) as? [String: Any]
    let details = decoded?["value"] as? [String: Any]
    guard let identifier = details?["sessionId"] as? String else {
        throw PageFetcherError.invalidSessionResponse
    }
    return identifier
}
|
||||
|
||||
/// Ask the browser of the given session to load `url`.
///
/// - Parameters:
///   - url: The address to navigate to, sent as the `url` field of the request body.
///   - sessionId: Identifier of the WebDriver session to use.
private func navigateToURL(_ url: String, sessionId: String) async throws {
    let payload: [String: Any] = ["url": url]
    _ = try await makeRequest(verb: "POST", path: "url", sessionId: sessionId, body: payload)
}
|
||||
|
||||
/// Retrieve the source of the page currently loaded in the given session.
///
/// - Parameter sessionId: Identifier of the WebDriver session to query.
/// - Returns: The string stored under `value` in the server's JSON response.
/// - Throws: `PageFetcherError.invalidPageSourceResponse` when the response is not a JSON
///   object with a string `value`, or any error raised by the request or JSON decoding.
private func getPageSource(sessionId: String) async throws -> String {
    let (payload, _) = try await makeRequest(verb: "GET", path: "source", sessionId: sessionId)
    let decoded = try JSONSerialization.jsonObject(with: payload) as? [String: Any]
    guard let html = decoded?["value"] as? String else {
        throw PageFetcherError.invalidPageSourceResponse
    }
    return html
}
|
||||
|
||||
/// End the WebDriver session identified by `sessionId`.
private func endSession(sessionId: String) async throws {
    // An empty sub-path makes the DELETE target the session resource itself.
    let sessionRoot = ""
    _ = try await makeRequest(verb: "DELETE", path: sessionRoot, sessionId: sessionId)
}
|
||||
|
||||
/// Send one HTTP request to the WebDriver server and return its successful response.
///
/// - Parameters:
///   - verb: The HTTP method, e.g. "GET", "POST" or "DELETE".
///   - path: Path appended to the WebDriver base URL.
///   - body: JSON object used as the request body. It is only sent for "POST" requests;
///     when omitted, an empty JSON object is sent instead.
/// - Returns: The response body together with the HTTP response.
/// - Throws: `PageFetcherError.requestFailed` when the response is not an HTTP response,
///   `PageFetcherError.seleniumError` for any status code other than 200, or any error
///   raised by `JSONSerialization` or `URLSession`.
private func makeRequest(verb: String, path: String,
                         body: [String: Any]? = nil) async throws -> (Data, HTTPURLResponse)
{
    var request = URLRequest(url: webDriverURL.appendingPathComponent(path))
    print("\(verb) \(request.url?.absoluteString ?? "")")
    request.httpMethod = verb
    if verb == "POST" {
        request.setValue("application/json", forHTTPHeaderField: "Content-Type")
        // A nil payload is not a valid JSON object for JSONSerialization; fall back to `{}`.
        request.httpBody = try JSONSerialization.data(withJSONObject: body ?? [:])
    }

    let (data, response) = try await session.data(for: request)
    guard let httpResponse = response as? HTTPURLResponse else {
        // This helper serves every endpoint, so report a generic request failure
        // rather than a session-creation-specific error.
        throw PageFetcherError.requestFailed
    }
    guard httpResponse.statusCode == 200 else {
        // Error responses may carry a human-readable message under `value.message`;
        // fall back to an empty string when the body has a different shape.
        let errorJSON = try? JSONSerialization.jsonObject(with: data) as? [String: Any]
        let errorValue = errorJSON?["value"] as? [String: Any]
        let message = errorValue?["message"] as? String ?? ""
        throw PageFetcherError.seleniumError(httpResponse.statusCode, message)
    }
    return (data, httpResponse)
}
|
||||
|
||||
/// Send a request to an endpoint below `wd/hub/session/<sessionId>`.
///
/// - Parameters:
///   - verb: The HTTP method.
///   - path: Sub-path below the session resource; an empty string addresses the session itself.
///   - sessionId: Identifier of the WebDriver session.
///   - body: Optional JSON object forwarded to the underlying request helper.
/// - Returns: The response body together with the HTTP response.
private func makeRequest(verb: String, path: String, sessionId: String,
                         body: [String: Any]? = nil) async throws -> (Data, HTTPURLResponse)
{
    // Drop empty pieces so no doubled or trailing "/" is produced by the join.
    var pieces: [String] = []
    for piece in [sessionId, path] where !piece.isEmpty {
        pieces.append(piece)
    }
    let suffix = pieces.joined(separator: "/")
    return try await makeRequest(verb: verb, path: "wd/hub/session/" + suffix, body: body)
}
|
||||
}
|
||||
|
||||
// MARK: - Errors
|
||||
|
||||
// MARK: - Usage Example
|
||||
|
||||
public extension PageFetcher {
    /// Fetch the HTML of the GoComics A-to-Z listing page.
    ///
    /// - Returns: Whatever `fetchHTML(from:)` returns for the listing URL.
    func fetchAToZ() async throws -> String {
        let listingURL = "https://www.gocomics.com/comics/a-to-z"
        return try await fetchHTML(from: listingURL)
    }
}
|
||||
28
Sources/PuttyKit/Fetchers/PageFetcherError.swift
Normal file
28
Sources/PuttyKit/Fetchers/PageFetcherError.swift
Normal file
@ -0,0 +1,28 @@
|
||||
import Foundation
|
||||
|
||||
/// The errors reported by `PageFetcher`.
public enum PageFetcherError: Error, LocalizedError {
    /// A WebDriver session could not be created.
    case sessionCreationFailed
    /// The response to a session request did not have the expected shape.
    case invalidSessionResponse
    /// There is no active WebDriver session.
    case noActiveSession
    /// The server answered with an error; carries the HTTP status code and the server's message.
    case seleniumError(Int, String)
    /// A request could not be made.
    case requestFailed
    /// The page-source response did not have the expected shape.
    case invalidPageSourceResponse

    /// Human-readable description of the error.
    public var errorDescription: String? {
        switch self {
        case .sessionCreationFailed:
            return "Failed to create WebDriver session"
        case .invalidSessionResponse:
            return "Invalid response when creating WebDriver session"
        case .noActiveSession:
            return "No active WebDriver session"
        case .seleniumError(let status, let detail):
            return "Selenium error (HTTP \(status)): \(detail)"
        case .requestFailed:
            return "Failed to make request"
        case .invalidPageSourceResponse:
            return "Invalid response when getting page source"
        }
    }
}
|
||||
Loading…
x
Reference in New Issue
Block a user