Fill out AToZ parser to my liking

This commit is contained in:
Ben Kreeger 2025-08-25 23:27:05 -05:00
parent db9127e891
commit f05aa1161c
Signed by: kreeger
GPG Key ID: D5CF8683D4BE4B50
7 changed files with 108 additions and 17 deletions

View File

@ -1,5 +1,5 @@
{
"originHash" : "7168671e688b0ccae86155da01e80e0d8c44c91c80a5989a6c075f4f2ae64b75",
"originHash" : "26a62968da6c130d62ba93c66650932d484e42350b23f874d07bd9961277a3b9",
"pins" : [
{
"identity" : "swift-argument-parser",
@ -9,6 +9,15 @@
"revision" : "309a47b2b1d9b5e991f36961c983ecec72275be3",
"version" : "1.6.1"
}
},
{
"identity" : "swiftsoup",
"kind" : "remoteSourceControl",
"location" : "https://github.com/scinfu/SwiftSoup",
"state" : {
"revision" : "3a439f9eccc391b264d54516ce640251552eb0c4",
"version" : "2.10.3"
}
}
],
"version" : 3

View File

@ -11,15 +11,18 @@ let package = Package(
],
dependencies: [
.package(url: "https://github.com/apple/swift-argument-parser", from: "1.6.1"),
.package(url: "https://github.com/scinfu/SwiftSoup", from: "2.6.0"),
],
targets: [
// Targets are the basic building blocks of a package, defining a module or a test suite.
// Targets can depend on other targets in this package and products from dependencies.
.target(name: "PuttyKit"),
.target(name: "PuttyKit",
dependencies: ["SwiftSoup"]),
.executableTarget(name: "putty",
dependencies: [
.product(name: "ArgumentParser", package: "swift-argument-parser"),
.target(name: "PuttyKit"),
],
path: "Sources/CLI"),
])
],
)

View File

@ -11,12 +11,14 @@ struct CLI: AsyncParsableCommand {
}
struct Scrape: AsyncParsableCommand {
static let configuration: CommandConfiguration = .init(abstract: "Scrape all necessary data.")
static let configuration = CommandConfiguration(abstract: "Scrape all necessary data.")
mutating func run() async throws {
print("scrape")
let fetcher = PageFetcher(webDriverURL: "http://browser:4444")
_ = try await fetcher.fetchAToZ()
print("Fetch complete!")
let content = try await fetcher.fetchHTML(from: "https://www.gocomics.com/comics/a-to-z",
waitFor: AToZParser.baseSelector)
let parsed = try AToZParser.parse(content: content)
print("Fetch complete: \(parsed.count) features fetched.")
print("\(parsed.count(where: \.isUpdated)) features updated today.")
}
}

View File

@ -4,7 +4,7 @@ import Foundation
#endif
/// A type that connects to a Selenium WebDriver running Chromium to fetch HTML page content
public final class PageFetcher {
public struct PageFetcher {
private let webDriverURL: URL
private let session = URLSession.shared
@ -23,9 +23,12 @@ public final class PageFetcher {
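/// - Parameters:
///   - url: The URL to fetch content from
///   - element: Optional CSS selector to wait for (up to 5 attempts) before the page source is read; pass `nil` to skip waiting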
/// - Returns: The raw HTML content as a string
/// - Throws: PageFetcherError for various failure scenarios
public func fetchHTML(from url: String) async throws -> String {
public func fetchHTML(from url: String, waitFor element: String?) async throws -> String {
let sessionId = try await startSession()
try await navigateToURL(url, sessionId: sessionId)
if let element {
try await waitUntilElement(sessionId: sessionId, selector: element, attempts: 5)
}
let source = try await getPageSource(sessionId: sessionId)
try await endSession(sessionId: sessionId)
return source
@ -66,6 +69,26 @@ public final class PageFetcher {
_ = try await makeRequest(verb: "POST", path: "url", sessionId: sessionId, body: ["url": url])
}
/// Asks the WebDriver session whether an element matching a CSS selector exists.
///
/// Issues a `POST` to the session's `element` path using the `css selector`
/// strategy. Any error raised by the request is treated as "not found" rather
/// than being propagated.
///
/// - Parameters:
///   - sessionId: Identifier of the WebDriver session to query.
///   - selector: CSS selector to look up.
/// - Returns: `true` when the request succeeds, `false` when it fails for any reason.
private func findElement(sessionId: String, selector: String) async throws -> Bool {
    // `try?` collapses every failure into nil, which maps to `false` below.
    let response = try? await makeRequest(
        verb: "POST",
        path: "element",
        sessionId: sessionId,
        body: ["using": "css selector", "value": selector]
    )
    return response != nil
}
/// Polls the WebDriver session until an element matching `selector` is found
/// or the attempt budget is used up.
///
/// A one-second pause follows every unsuccessful attempt, including the last
/// one. When all attempts fail the method returns normally without throwing,
/// so callers cannot tell from the result whether the element ever appeared.
///
/// - Parameters:
///   - sessionId: Identifier of the WebDriver session to query.
///   - selector: CSS selector to wait for.
///   - attempts: Maximum number of lookups to perform.
/// - Throws: Any error raised by `findElement` or by `Task.sleep`.
private func waitUntilElement(sessionId: String, selector: String, attempts: Int) async throws {
    // One second, expressed in nanoseconds for `Task.sleep(nanoseconds:)`.
    let pollInterval: UInt64 = 1_000_000_000
    for _ in 0 ..< attempts {
        if try await findElement(sessionId: sessionId, selector: selector) {
            return
        }
        try await Task.sleep(nanoseconds: pollInterval)
    }
}
private func getPageSource(sessionId: String) async throws -> String {
let (data, _) = try await makeRequest(verb: "GET", path: "source", sessionId: sessionId)
guard let json = try JSONSerialization.jsonObject(with: data) as? [String: Any],
@ -95,7 +118,7 @@ public final class PageFetcher {
guard let httpResponse = response as? HTTPURLResponse else {
throw PageFetcherError.invalidSessionResponse
}
guard httpResponse.statusCode == 200 else {
guard httpResponse.statusCode < 400 else {
let maybeJSON = try? JSONSerialization.jsonObject(with: data) as?
[String: Any]
let value = maybeJSON?["value"] as? [String: Any]
@ -112,10 +135,3 @@ public final class PageFetcher {
return try await makeRequest(verb: verb, path: "wd/hub/session/\(addendum)", body: body)
}
}
public extension PageFetcher {
/// Convenience method to fetch GoComics A-Z page
func fetchAToZ() async throws -> String {
try await fetchHTML(from: "https://www.gocomics.com/comics/a-to-z")
}
}

View File

@ -0,0 +1,33 @@
import Foundation
import SwiftSoup
/// Namespace for turning the HTML of the GoComics "A to Z" listing page into `Feature` values.
///
/// The type has no cases and is never instantiated; it only groups the CSS
/// selectors and the parsing entry point.
public enum AToZParser {
    /// CSS selector for the section that wraps the whole listing. Public so
    /// callers can wait for this element before capturing the page source.
    // NOTE(review): the class name looks build-generated and may change when the site is redeployed — confirm.
    public static let baseSelector = "section.FilterComics_filterComics__9VGXB"
    /// CSS selector matching the anchor of each feature card inside the listing.
    static let cardSelector = "a[data-analytics-type=\"feature_card\"]"
    /// CSS selector for the element holding a card's title.
    static let titleSelector = "h3"
    /// CSS selector for the element holding a card's byline.
    static let bylineSelector = "h4"
    /// CSS selector for a card's image.
    static let imageSelector = "img"
    /// CSS selector whose presence inside a card marks the feature as updated.
    static let updatedSelector = "span"

    /// Parses the listing page and returns one `Feature` per feature card.
    ///
    /// - Parameter content: Raw HTML of the listing page.
    /// - Returns: The features in the order their cards are selected from the listing.
    /// - Throws: `ParserError.missingElement` when the listing section is absent or
    ///   a card has no title or byline element; also rethrows any SwiftSoup error.
    public static func parse(content: String) throws -> [Feature] {
        let document = try SwiftSoup.parse(content)
        let listing = try element(from: document, selector: baseSelector)
        let cards = try listing.select(cardSelector)
        // Every card yields exactly one Feature (or throws), so `map` is the
        // right transform; nothing is ever filtered out.
        return try cards.map { card -> Feature in
            let identifier = try card.attr("href")
            let title = try element(from: card, selector: titleSelector).text()
            let byline = try element(from: card, selector: bylineSelector).text()
            // The image is optional: a missing <img> or an unparsable `src` yields nil.
            let imageURL = try card.select(imageSelector).first()
                .map { try $0.attr("src") }
                .flatMap { URL(string: $0) }
            let isUpdated = try card.select(updatedSelector).first() != nil
            return Feature(identifier: identifier,
                           title: title,
                           byline: byline,
                           imageURL: imageURL,
                           isUpdated: isUpdated)
        }
    }

    /// Returns the first descendant of `root` that matches `selector`.
    ///
    /// - Parameters:
    ///   - root: Element (or document) to search within.
    ///   - selector: CSS selector to match.
    /// - Returns: The first matching element.
    /// - Throws: `ParserError.missingElement` carrying `selector` when nothing matches;
    ///   also rethrows any SwiftSoup error.
    static func element(from root: SwiftSoup.Element, selector: String) throws -> SwiftSoup.Element {
        guard let match = try root.select(selector).first() else {
            throw ParserError.missingElement(selector)
        }
        return match
    }
}

View File

@ -0,0 +1,9 @@
import Foundation
/// A single comic feature as listed on the A-to-Z page.
///
/// Conforms to `Equatable` so parsed results can be compared directly, and to
/// `Sendable` (all stored properties are immutable value types) so values can
/// cross task boundaries.
public struct Feature: Equatable, Sendable {
    /// Identifier of the feature; `AToZParser` fills this with the card link's `href`.
    public let identifier: String
    /// Display title of the feature.
    public let title: String
    /// Byline text shown under the title.
    public let byline: String
    /// URL of the feature's image, or `nil` when none could be determined.
    public let imageURL: URL?
    /// Whether the listing marked the feature as updated.
    public let isUpdated: Bool
}

View File

@ -0,0 +1,19 @@
import Foundation
/// Failures that a parser can report.
public enum ParserError: Error, LocalizedError {
    /// A developer-facing failure described by the associated message.
    case devError(String)
    /// The content could not be encoded as UTF-8.
    case encodingError
    /// No element matched the associated selector.
    case missingElement(String)

    /// Human-readable description of the failure.
    public var errorDescription: String? {
        switch self {
        case .devError(let message):
            return "devError: \(message)"
        case .encodingError:
            return "Error encoding content to UTF-8"
        case .missingElement(let selector):
            return "Failed to find element: \(selector)"
        }
    }
}