From f05aa1161c3ecb2f886eb4097c37806d661c9817 Mon Sep 17 00:00:00 2001 From: Ben Kreeger Date: Mon, 25 Aug 2025 23:27:05 -0500 Subject: [PATCH] Fill out AToZ parser to my liking --- Package.resolved | 11 +++++- Package.swift | 7 ++-- Sources/CLI/CLI.swift | 10 +++--- Sources/PuttyKit/Fetchers/PageFetcher.swift | 36 +++++++++++++------ Sources/PuttyKit/Parsers/AToZ.swift | 33 +++++++++++++++++ Sources/PuttyKit/Parsers/Models/Feature.swift | 9 +++++ Sources/PuttyKit/Parsers/ParserError.swift | 19 ++++++++++ 7 files changed, 108 insertions(+), 17 deletions(-) create mode 100644 Sources/PuttyKit/Parsers/AToZ.swift create mode 100644 Sources/PuttyKit/Parsers/Models/Feature.swift create mode 100644 Sources/PuttyKit/Parsers/ParserError.swift diff --git a/Package.resolved b/Package.resolved index b60c05a..b1719af 100644 --- a/Package.resolved +++ b/Package.resolved @@ -1,5 +1,5 @@ { - "originHash" : "7168671e688b0ccae86155da01e80e0d8c44c91c80a5989a6c075f4f2ae64b75", + "originHash" : "26a62968da6c130d62ba93c66650932d484e42350b23f874d07bd9961277a3b9", "pins" : [ { "identity" : "swift-argument-parser", @@ -9,6 +9,15 @@ "revision" : "309a47b2b1d9b5e991f36961c983ecec72275be3", "version" : "1.6.1" } + }, + { + "identity" : "swiftsoup", + "kind" : "remoteSourceControl", + "location" : "https://github.com/scinfu/SwiftSoup", + "state" : { + "revision" : "3a439f9eccc391b264d54516ce640251552eb0c4", + "version" : "2.10.3" + } } ], "version" : 3 diff --git a/Package.swift b/Package.swift index e2952d6..8112365 100644 --- a/Package.swift +++ b/Package.swift @@ -11,15 +11,18 @@ let package = Package( ], dependencies: [ .package(url: "https://github.com/apple/swift-argument-parser", from: "1.6.1"), + .package(url: "https://github.com/scinfu/SwiftSoup", from: "2.6.0"), ], targets: [ // Targets are the basic building blocks of a package, defining a module or a test suite. // Targets can depend on other targets in this package and products from dependencies. - .target(name: "PuttyKit"), + .target(name: "PuttyKit", + dependencies: ["SwiftSoup"]), .executableTarget(name: "putty", dependencies: [ .product(name: "ArgumentParser", package: "swift-argument-parser"), .target(name: "PuttyKit"), ], path: "Sources/CLI"), - ]) + ], +) diff --git a/Sources/CLI/CLI.swift b/Sources/CLI/CLI.swift index 9cdfcf1..65aba7b 100644 --- a/Sources/CLI/CLI.swift +++ b/Sources/CLI/CLI.swift @@ -11,12 +11,14 @@ struct CLI: AsyncParsableCommand { } struct Scrape: AsyncParsableCommand { - static let configuration: CommandConfiguration = .init(abstract: "Scrape all necessary data.") + static let configuration = CommandConfiguration(abstract: "Scrape all necessary data.") mutating func run() async throws { - print("scrape") let fetcher = PageFetcher(webDriverURL: "http://browser:4444") - _ = try await fetcher.fetchAToZ() - print("Fetch complete!") + let content = try await fetcher.fetchHTML(from: "https://www.gocomics.com/comics/a-to-z", + waitFor: AToZParser.baseSelector) + let parsed = try AToZParser.parse(content: content) + print("Fetch complete: \(parsed.count) features fetched.") + print("\(parsed.count(where: \.isUpdated)) features updated today.") } } diff --git a/Sources/PuttyKit/Fetchers/PageFetcher.swift b/Sources/PuttyKit/Fetchers/PageFetcher.swift index 74b8116..fbb34fa 100644 --- a/Sources/PuttyKit/Fetchers/PageFetcher.swift +++ b/Sources/PuttyKit/Fetchers/PageFetcher.swift @@ -4,7 +4,7 @@ import Foundation #endif /// A class that connects to a Selenium WebDriver running Chromium to fetch HTML page content -public final class PageFetcher { +public struct PageFetcher { private let webDriverURL: URL private let session = URLSession.shared @@ -23,9 +23,12 @@ public final class PageFetcher { /// - Parameter url: The URL to fetch content from /// - Returns: The raw HTML content as a string /// - Throws: PageFetcherError for various failure scenarios - public func fetchHTML(from url: String) async throws -> String { + public func fetchHTML(from url: String, waitFor element: String?) async throws -> String { let sessionId = try await startSession() try await navigateToURL(url, sessionId: sessionId) + if let element { + try await waitUntilElement(sessionId: sessionId, selector: element, attempts: 5) + } let source = try await getPageSource(sessionId: sessionId) try await endSession(sessionId: sessionId) return source @@ -66,6 +69,26 @@ public final class PageFetcher { _ = try await makeRequest(verb: "POST", path: "url", sessionId: sessionId, body: ["url": url]) } + private func findElement(sessionId: String, selector: String) async throws -> Bool { + do { + _ = try await makeRequest(verb: "POST", path: "element", sessionId: sessionId, body: ["using": "css selector", "value": selector]) + return true + } catch { + return false + } + } + + private func waitUntilElement(sessionId: String, selector: String, attempts: Int) async throws { + for _ in 0 ..< attempts { + let success = try await findElement(sessionId: sessionId, selector: selector) + if success { + return + } else { + try await Task.sleep(nanoseconds: 1 * 1_000_000_000) + } + } + } + private func getPageSource(sessionId: String) async throws -> String { let (data, _) = try await makeRequest(verb: "GET", path: "source", sessionId: sessionId) guard let json = try JSONSerialization.jsonObject(with: data) as? [String: Any], @@ -95,7 +118,7 @@ public final class PageFetcher { guard let httpResponse = response as? HTTPURLResponse else { throw PageFetcherError.invalidSessionResponse } - guard httpResponse.statusCode == 200 else { + guard httpResponse.statusCode < 400 else { let maybeJSON = try? JSONSerialization.jsonObject(with: data) as? [String: Any] let value = maybeJSON?["value"] as? [String: Any] @@ -112,10 +135,3 @@ public final class PageFetcher { return try await makeRequest(verb: verb, path: "wd/hub/session/\(addendum)", body: body) } } - -public extension PageFetcher { - /// Convenience method to fetch GoComics A-Z page - func fetchAToZ() async throws -> String { - try await fetchHTML(from: "https://www.gocomics.com/comics/a-to-z") - } -} diff --git a/Sources/PuttyKit/Parsers/AToZ.swift b/Sources/PuttyKit/Parsers/AToZ.swift new file mode 100644 index 0000000..033342a --- /dev/null +++ b/Sources/PuttyKit/Parsers/AToZ.swift @@ -0,0 +1,33 @@ +import Foundation +import SwiftSoup + +public enum AToZParser { + public static let baseSelector = "section.FilterComics_filterComics__9VGXB" + static let cardSelector = "a[data-analytics-type=\"feature_card\"]" + static let titleSelector = "h3" + static let bylineSelector = "h4" + static let imageSelector = "img" + static let updatedSelector = "span" + + public static func parse(content: String) throws -> [Feature] { + let document = try SwiftSoup.parse(content) + let content = try element(from: document, selector: baseSelector) + let links = try content.select(cardSelector) + let features = try links.compactMap { card in + let identifier = try card.attr("href") + let title = try element(from: card, selector: titleSelector).text() + let byline = try element(from: card, selector: bylineSelector).text() + let image = try card.select(imageSelector).first().map { try $0.attr("src") }.flatMap { URL(string: $0) } + let span = try card.select(updatedSelector).first() + return Feature(identifier: identifier, title: title, byline: byline, imageURL: image, isUpdated: span != nil) + } + return features + } + + static func element(from document: SwiftSoup.Element, selector: String) throws -> SwiftSoup.Element { + guard let result = try document.select(selector).first() else { + throw ParserError.missingElement(selector) + } + return result + } +} diff --git a/Sources/PuttyKit/Parsers/Models/Feature.swift b/Sources/PuttyKit/Parsers/Models/Feature.swift new file mode 100644 index 0000000..200e26e --- /dev/null +++ b/Sources/PuttyKit/Parsers/Models/Feature.swift @@ -0,0 +1,9 @@ +import Foundation + +public struct Feature { + public let identifier: String + public let title: String + public let byline: String + public let imageURL: URL? + public let isUpdated: Bool +} diff --git a/Sources/PuttyKit/Parsers/ParserError.swift b/Sources/PuttyKit/Parsers/ParserError.swift new file mode 100644 index 0000000..1d70ccb --- /dev/null +++ b/Sources/PuttyKit/Parsers/ParserError.swift @@ -0,0 +1,19 @@ +import Foundation + +/// Describes any errors that can happen with a Parser. +public enum ParserError: Error, LocalizedError { + case devError(String) + case encodingError + case missingElement(String) + + public var errorDescription: String? { + switch self { + case let .devError(string): + "devError: \(string)" + case .encodingError: + "Error encoding content to UTF-8" + case let .missingElement(element): + "Failed to find element: \(element)" + } + } +}