Fill out AToZ parser to my liking
This commit is contained in:
parent
db9127e891
commit
f05aa1161c
@ -1,5 +1,5 @@
|
||||
{
|
||||
"originHash" : "7168671e688b0ccae86155da01e80e0d8c44c91c80a5989a6c075f4f2ae64b75",
|
||||
"originHash" : "26a62968da6c130d62ba93c66650932d484e42350b23f874d07bd9961277a3b9",
|
||||
"pins" : [
|
||||
{
|
||||
"identity" : "swift-argument-parser",
|
||||
@ -9,6 +9,15 @@
|
||||
"revision" : "309a47b2b1d9b5e991f36961c983ecec72275be3",
|
||||
"version" : "1.6.1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "swiftsoup",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/scinfu/SwiftSoup",
|
||||
"state" : {
|
||||
"revision" : "3a439f9eccc391b264d54516ce640251552eb0c4",
|
||||
"version" : "2.10.3"
|
||||
}
|
||||
}
|
||||
],
|
||||
"version" : 3
|
||||
|
||||
@ -11,15 +11,18 @@ let package = Package(
|
||||
],
|
||||
dependencies: [
|
||||
.package(url: "https://github.com/apple/swift-argument-parser", from: "1.6.1"),
|
||||
.package(url: "https://github.com/scinfu/SwiftSoup", from: "2.6.0"),
|
||||
],
|
||||
targets: [
|
||||
// Targets are the basic building blocks of a package, defining a module or a test suite.
|
||||
// Targets can depend on other targets in this package and products from dependencies.
|
||||
.target(name: "PuttyKit"),
|
||||
.target(name: "PuttyKit",
|
||||
dependencies: ["SwiftSoup"]),
|
||||
.executableTarget(name: "putty",
|
||||
dependencies: [
|
||||
.product(name: "ArgumentParser", package: "swift-argument-parser"),
|
||||
.target(name: "PuttyKit"),
|
||||
],
|
||||
path: "Sources/CLI"),
|
||||
])
|
||||
],
|
||||
)
|
||||
|
||||
@ -11,12 +11,14 @@ struct CLI: AsyncParsableCommand {
|
||||
}
|
||||
|
||||
struct Scrape: AsyncParsableCommand {
|
||||
static let configuration: CommandConfiguration = .init(abstract: "Scrape all necessary data.")
|
||||
static let configuration = CommandConfiguration(abstract: "Scrape all necessary data.")
|
||||
|
||||
mutating func run() async throws {
|
||||
print("scrape")
|
||||
let fetcher = PageFetcher(webDriverURL: "http://browser:4444")
|
||||
_ = try await fetcher.fetchAToZ()
|
||||
print("Fetch complete!")
|
||||
let content = try await fetcher.fetchHTML(from: "https://www.gocomics.com/comics/a-to-z",
|
||||
waitFor: AToZParser.baseSelector)
|
||||
let parsed = try AToZParser.parse(content: content)
|
||||
print("Fetch complete: \(parsed.count) features fetched.")
|
||||
print("\(parsed.count(where: \.isUpdated)) features updated today.")
|
||||
}
|
||||
}
|
||||
|
||||
@ -4,7 +4,7 @@ import Foundation
|
||||
#endif
|
||||
|
||||
/// A type that connects to a Selenium WebDriver running Chromium to fetch HTML page content
|
||||
public final class PageFetcher {
|
||||
public struct PageFetcher {
|
||||
private let webDriverURL: URL
|
||||
private let session = URLSession.shared
|
||||
|
||||
@ -23,9 +23,12 @@ public final class PageFetcher {
|
||||
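/// - Parameter url: The URL to fetch content from
/// - Parameter element: An optional CSS selector to wait for before reading the page source; pass `nil` to skip waiting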
|
||||
/// - Returns: The raw HTML content as a string
|
||||
/// - Throws: PageFetcherError for various failure scenarios
|
||||
public func fetchHTML(from url: String) async throws -> String {
|
||||
public func fetchHTML(from url: String, waitFor element: String?) async throws -> String {
|
||||
let sessionId = try await startSession()
|
||||
try await navigateToURL(url, sessionId: sessionId)
|
||||
if let element {
|
||||
try await waitUntilElement(sessionId: sessionId, selector: element, attempts: 5)
|
||||
}
|
||||
let source = try await getPageSource(sessionId: sessionId)
|
||||
try await endSession(sessionId: sessionId)
|
||||
return source
|
||||
@ -66,6 +69,26 @@ public final class PageFetcher {
|
||||
_ = try await makeRequest(verb: "POST", path: "url", sessionId: sessionId, body: ["url": url])
|
||||
}
|
||||
|
||||
/// Reports whether the WebDriver session can currently locate an element matching `selector`.
///
/// The lookup is a `POST` to the session's `element` endpoint using the `css selector` strategy.
/// Any error thrown by that request is treated as "not found", so this method never
/// actually throws despite its signature.
///
/// - Parameters:
///   - sessionId: The WebDriver session to query.
///   - selector: The CSS selector to look for.
/// - Returns: `true` when the request succeeds, `false` when it fails for any reason.
private func findElement(sessionId: String, selector: String) async throws -> Bool {
    let lookup = try? await makeRequest(verb: "POST",
                                        path: "element",
                                        sessionId: sessionId,
                                        body: ["using": "css selector", "value": selector])
    return lookup != nil
}
|
||||
|
||||
/// Polls the page until an element matching `selector` is found or `attempts` lookups have failed.
///
/// This is a best-effort wait: when every lookup fails the method returns normally, leaving it
/// to the caller to decide what to do with a page that never showed the element.
///
/// - Parameters:
///   - sessionId: The WebDriver session to poll.
///   - selector: The CSS selector to wait for.
///   - attempts: The maximum number of lookups; zero or negative values perform no lookups.
/// - Throws: `CancellationError` if the task is cancelled while pausing between lookups.
private func waitUntilElement(sessionId: String, selector: String, attempts: Int) async throws {
    // A half-open range with a negative upper bound traps, so bail out early instead.
    guard attempts > 0 else { return }

    for attempt in 1 ... attempts {
        if try await findElement(sessionId: sessionId, selector: selector) {
            return
        }
        // Pause only between lookups; sleeping after the final failure just delays the caller.
        if attempt < attempts {
            try await Task.sleep(nanoseconds: 1 * 1_000_000_000)
        }
    }
}
|
||||
|
||||
private func getPageSource(sessionId: String) async throws -> String {
|
||||
let (data, _) = try await makeRequest(verb: "GET", path: "source", sessionId: sessionId)
|
||||
guard let json = try JSONSerialization.jsonObject(with: data) as? [String: Any],
|
||||
@ -95,7 +118,7 @@ public final class PageFetcher {
|
||||
guard let httpResponse = response as? HTTPURLResponse else {
|
||||
throw PageFetcherError.invalidSessionResponse
|
||||
}
|
||||
guard httpResponse.statusCode == 200 else {
|
||||
guard httpResponse.statusCode < 400 else {
|
||||
let maybeJSON = try? JSONSerialization.jsonObject(with: data) as?
|
||||
[String: Any]
|
||||
let value = maybeJSON?["value"] as? [String: Any]
|
||||
@ -112,10 +135,3 @@ public final class PageFetcher {
|
||||
return try await makeRequest(verb: verb, path: "wd/hub/session/\(addendum)", body: body)
|
||||
}
|
||||
}
|
||||
|
||||
public extension PageFetcher {
|
||||
/// Convenience method to fetch GoComics A-Z page
|
||||
func fetchAToZ() async throws -> String {
|
||||
try await fetchHTML(from: "https://www.gocomics.com/comics/a-to-z")
|
||||
}
|
||||
}
|
||||
|
||||
33
Sources/PuttyKit/Parsers/AToZ.swift
Normal file
33
Sources/PuttyKit/Parsers/AToZ.swift
Normal file
@ -0,0 +1,33 @@
|
||||
import Foundation
|
||||
import SwiftSoup
|
||||
|
||||
/// Parses the HTML of the GoComics A-to-Z listing into `Feature` values.
public enum AToZParser {
    /// Selector for the section that wraps every feature card.
    public static let baseSelector = "section.FilterComics_filterComics__9VGXB"
    /// Selector for one feature card link inside the base section.
    static let cardSelector = "a[data-analytics-type=\"feature_card\"]"
    /// Selector for the title element inside a card.
    static let titleSelector = "h3"
    /// Selector for the byline element inside a card.
    static let bylineSelector = "h4"
    /// Selector for the image element inside a card.
    static let imageSelector = "img"
    /// Selector whose presence inside a card marks the feature as updated.
    static let updatedSelector = "span"

    /// Extracts one `Feature` per feature card found in `content`.
    ///
    /// - Parameter content: The raw HTML of the listing page.
    /// - Returns: The features in the order their cards are selected from the document.
    /// - Throws: `ParserError.missingElement` when the base section, or a card's title or
    ///   byline element, cannot be found; any error thrown by SwiftSoup is passed through.
    public static func parse(content: String) throws -> [Feature] {
        let document = try SwiftSoup.parse(content)
        // Named `listing` rather than `content` so the HTML parameter is not shadowed.
        let listing = try element(from: document, selector: baseSelector)
        let cards = try listing.select(cardSelector)

        // Every card yields a feature or throws, so `map` (not `compactMap`) is the right fit.
        return try cards.map { card in
            let identifier = try card.attr("href")
            let title = try element(from: card, selector: titleSelector).text()
            let byline = try element(from: card, selector: bylineSelector).text()
            // The image is optional: nil when there is no `img` or its `src` is not a valid URL.
            let imageURL = try card.select(imageSelector).first()
                .map { try $0.attr("src") }
                .flatMap { URL(string: $0) }
            let isUpdated = try card.select(updatedSelector).first() != nil
            return Feature(identifier: identifier,
                           title: title,
                           byline: byline,
                           imageURL: imageURL,
                           isUpdated: isUpdated)
        }
    }

    /// Returns the first element under `document` that matches `selector`.
    ///
    /// - Throws: `ParserError.missingElement(selector)` when nothing matches.
    static func element(from document: SwiftSoup.Element, selector: String) throws -> SwiftSoup.Element {
        guard let result = try document.select(selector).first() else {
            throw ParserError.missingElement(selector)
        }
        return result
    }
}
|
||||
9
Sources/PuttyKit/Parsers/Models/Feature.swift
Normal file
9
Sources/PuttyKit/Parsers/Models/Feature.swift
Normal file
@ -0,0 +1,9 @@
|
||||
import Foundation
|
||||
|
||||
/// A single comic feature as listed on the A-to-Z page.
///
/// `Hashable` lets features be compared and collected into sets; `Sendable` lets them be
/// passed between concurrency domains.
public struct Feature: Hashable, Sendable {
    /// Identifies the feature; `AToZParser` fills this from the card link's `href`.
    public let identifier: String
    /// The feature's title text.
    public let title: String
    /// The feature's byline text.
    public let byline: String
    /// The feature's image, if one was present and its address was a valid URL.
    public let imageURL: URL?
    /// Whether the listing marked the feature as updated.
    public let isUpdated: Bool

    /// Creates a feature from its parts.
    ///
    /// Declared explicitly because the synthesized memberwise initializer is internal and
    /// would not be usable outside this module.
    public init(identifier: String, title: String, byline: String, imageURL: URL?, isUpdated: Bool) {
        self.identifier = identifier
        self.title = title
        self.byline = byline
        self.imageURL = imageURL
        self.isUpdated = isUpdated
    }
}
|
||||
19
Sources/PuttyKit/Parsers/ParserError.swift
Normal file
19
Sources/PuttyKit/Parsers/ParserError.swift
Normal file
@ -0,0 +1,19 @@
|
||||
import Foundation
|
||||
|
||||
/// The failures a parser can report.
public enum ParserError: Error, LocalizedError {
    /// A developer-facing failure carrying a free-form message.
    case devError(String)
    /// The content could not be encoded as UTF-8.
    case encodingError
    /// No element matched the associated selector.
    case missingElement(String)

    /// A human-readable message describing the error.
    public var errorDescription: String? {
        switch self {
        case .devError(let message):
            return "devError: \(message)"
        case .encodingError:
            return "Error encoding content to UTF-8"
        case .missingElement(let selector):
            return "Failed to find element: \(selector)"
        }
    }
}
|
||||
Loading…
x
Reference in New Issue
Block a user