Fill out AToZ parser to my liking
This commit is contained in:
parent
db9127e891
commit
f05aa1161c
@ -1,5 +1,5 @@
|
|||||||
{
|
{
|
||||||
"originHash" : "7168671e688b0ccae86155da01e80e0d8c44c91c80a5989a6c075f4f2ae64b75",
|
"originHash" : "26a62968da6c130d62ba93c66650932d484e42350b23f874d07bd9961277a3b9",
|
||||||
"pins" : [
|
"pins" : [
|
||||||
{
|
{
|
||||||
"identity" : "swift-argument-parser",
|
"identity" : "swift-argument-parser",
|
||||||
@ -9,6 +9,15 @@
|
|||||||
"revision" : "309a47b2b1d9b5e991f36961c983ecec72275be3",
|
"revision" : "309a47b2b1d9b5e991f36961c983ecec72275be3",
|
||||||
"version" : "1.6.1"
|
"version" : "1.6.1"
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"identity" : "swiftsoup",
|
||||||
|
"kind" : "remoteSourceControl",
|
||||||
|
"location" : "https://github.com/scinfu/SwiftSoup",
|
||||||
|
"state" : {
|
||||||
|
"revision" : "3a439f9eccc391b264d54516ce640251552eb0c4",
|
||||||
|
"version" : "2.10.3"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"version" : 3
|
"version" : 3
|
||||||
|
|||||||
@ -11,15 +11,18 @@ let package = Package(
|
|||||||
],
|
],
|
||||||
dependencies: [
|
dependencies: [
|
||||||
.package(url: "https://github.com/apple/swift-argument-parser", from: "1.6.1"),
|
.package(url: "https://github.com/apple/swift-argument-parser", from: "1.6.1"),
|
||||||
|
.package(url: "https://github.com/scinfu/SwiftSoup", from: "2.6.0"),
|
||||||
],
|
],
|
||||||
targets: [
|
targets: [
|
||||||
// Targets are the basic building blocks of a package, defining a module or a test suite.
|
// Targets are the basic building blocks of a package, defining a module or a test suite.
|
||||||
// Targets can depend on other targets in this package and products from dependencies.
|
// Targets can depend on other targets in this package and products from dependencies.
|
||||||
.target(name: "PuttyKit"),
|
.target(name: "PuttyKit",
|
||||||
|
dependencies: ["SwiftSoup"]),
|
||||||
.executableTarget(name: "putty",
|
.executableTarget(name: "putty",
|
||||||
dependencies: [
|
dependencies: [
|
||||||
.product(name: "ArgumentParser", package: "swift-argument-parser"),
|
.product(name: "ArgumentParser", package: "swift-argument-parser"),
|
||||||
.target(name: "PuttyKit"),
|
.target(name: "PuttyKit"),
|
||||||
],
|
],
|
||||||
path: "Sources/CLI"),
|
path: "Sources/CLI"),
|
||||||
])
|
],
|
||||||
|
)
|
||||||
|
|||||||
@ -11,12 +11,14 @@ struct CLI: AsyncParsableCommand {
|
|||||||
}
|
}
|
||||||
|
|
||||||
struct Scrape: AsyncParsableCommand {
|
struct Scrape: AsyncParsableCommand {
|
||||||
static let configuration: CommandConfiguration = .init(abstract: "Scrape all necessary data.")
|
static let configuration = CommandConfiguration(abstract: "Scrape all necessary data.")
|
||||||
|
|
||||||
mutating func run() async throws {
|
mutating func run() async throws {
|
||||||
print("scrape")
|
|
||||||
let fetcher = PageFetcher(webDriverURL: "http://browser:4444")
|
let fetcher = PageFetcher(webDriverURL: "http://browser:4444")
|
||||||
_ = try await fetcher.fetchAToZ()
|
let content = try await fetcher.fetchHTML(from: "https://www.gocomics.com/comics/a-to-z",
|
||||||
print("Fetch complete!")
|
waitFor: AToZParser.baseSelector)
|
||||||
|
let parsed = try AToZParser.parse(content: content)
|
||||||
|
print("Fetch complete: \(parsed.count) features fetched.")
|
||||||
|
print("\(parsed.count(where: \.isUpdated)) features updated today.")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -4,7 +4,7 @@ import Foundation
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
/// A class that connects to a Selenium WebDriver running Chromium to fetch HTML page content
|
/// A class that connects to a Selenium WebDriver running Chromium to fetch HTML page content
|
||||||
public final class PageFetcher {
|
public struct PageFetcher {
|
||||||
private let webDriverURL: URL
|
private let webDriverURL: URL
|
||||||
private let session = URLSession.shared
|
private let session = URLSession.shared
|
||||||
|
|
||||||
@ -23,9 +23,12 @@ public final class PageFetcher {
|
|||||||
/// - Parameter url: The URL to fetch content from
|
/// - Parameter url: The URL to fetch content from
|
||||||
/// - Returns: The raw HTML content as a string
|
/// - Returns: The raw HTML content as a string
|
||||||
/// - Throws: PageFetcherError for various failure scenarios
|
/// - Throws: PageFetcherError for various failure scenarios
|
||||||
public func fetchHTML(from url: String) async throws -> String {
|
public func fetchHTML(from url: String, waitFor element: String?) async throws -> String {
|
||||||
let sessionId = try await startSession()
|
let sessionId = try await startSession()
|
||||||
try await navigateToURL(url, sessionId: sessionId)
|
try await navigateToURL(url, sessionId: sessionId)
|
||||||
|
if let element {
|
||||||
|
try await waitUntilElement(sessionId: sessionId, selector: element, attempts: 5)
|
||||||
|
}
|
||||||
let source = try await getPageSource(sessionId: sessionId)
|
let source = try await getPageSource(sessionId: sessionId)
|
||||||
try await endSession(sessionId: sessionId)
|
try await endSession(sessionId: sessionId)
|
||||||
return source
|
return source
|
||||||
@ -66,6 +69,26 @@ public final class PageFetcher {
|
|||||||
_ = try await makeRequest(verb: "POST", path: "url", sessionId: sessionId, body: ["url": url])
|
_ = try await makeRequest(verb: "POST", path: "url", sessionId: sessionId, body: ["url": url])
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Asks the WebDriver session whether an element matching `selector` exists.
///
/// Issues a `POST` to the session's `element` endpoint using the
/// `css selector` location strategy.
///
/// - Parameters:
///   - sessionId: Identifier of the WebDriver session to query.
///   - selector: CSS selector of the element to look for.
/// - Returns: `true` when the request succeeds, `false` when it fails for any
///   reason (every error raised by the request is treated as "not found").
private func findElement(sessionId: String, selector: String) async throws -> Bool {
    // Any failure of the lookup request is collapsed into `nil` here.
    let response = try? await makeRequest(verb: "POST", path: "element", sessionId: sessionId, body: ["using": "css selector", "value": selector])
    return response != nil
}
|
||||||
|
|
||||||
|
/// Polls the WebDriver session until an element matching `selector` shows up.
///
/// The lookup is tried up to `attempts` times with a one-second pause between
/// consecutive tries. The wait is best-effort: if the element is still
/// missing after the last try the function returns normally, leaving it to
/// the caller to deal with a page that lacks the element.
///
/// - Parameters:
///   - sessionId: Identifier of the WebDriver session to query.
///   - selector: CSS selector of the element to wait for.
///   - attempts: Maximum number of lookups; zero or a negative value means
///     no lookup is performed.
/// - Throws: Whatever `Task.sleep` throws while pausing between tries.
private func waitUntilElement(sessionId: String, selector: String, attempts: Int) async throws {
    // Clamp so a negative count cannot trap when forming the range.
    let totalAttempts = max(attempts, 0)
    for attempt in 0 ..< totalAttempts {
        if try await findElement(sessionId: sessionId, selector: selector) {
            return
        }
        // Pause only when another lookup follows; sleeping after the final
        // failed try would just delay the caller by a second.
        if attempt < totalAttempts - 1 {
            try await Task.sleep(nanoseconds: 1_000_000_000)
        }
    }
}
|
||||||
|
|
||||||
private func getPageSource(sessionId: String) async throws -> String {
|
private func getPageSource(sessionId: String) async throws -> String {
|
||||||
let (data, _) = try await makeRequest(verb: "GET", path: "source", sessionId: sessionId)
|
let (data, _) = try await makeRequest(verb: "GET", path: "source", sessionId: sessionId)
|
||||||
guard let json = try JSONSerialization.jsonObject(with: data) as? [String: Any],
|
guard let json = try JSONSerialization.jsonObject(with: data) as? [String: Any],
|
||||||
@ -95,7 +118,7 @@ public final class PageFetcher {
|
|||||||
guard let httpResponse = response as? HTTPURLResponse else {
|
guard let httpResponse = response as? HTTPURLResponse else {
|
||||||
throw PageFetcherError.invalidSessionResponse
|
throw PageFetcherError.invalidSessionResponse
|
||||||
}
|
}
|
||||||
guard httpResponse.statusCode == 200 else {
|
guard httpResponse.statusCode < 400 else {
|
||||||
let maybeJSON = try? JSONSerialization.jsonObject(with: data) as?
|
let maybeJSON = try? JSONSerialization.jsonObject(with: data) as?
|
||||||
[String: Any]
|
[String: Any]
|
||||||
let value = maybeJSON?["value"] as? [String: Any]
|
let value = maybeJSON?["value"] as? [String: Any]
|
||||||
@ -112,10 +135,3 @@ public final class PageFetcher {
|
|||||||
return try await makeRequest(verb: verb, path: "wd/hub/session/\(addendum)", body: body)
|
return try await makeRequest(verb: verb, path: "wd/hub/session/\(addendum)", body: body)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public extension PageFetcher {
|
|
||||||
/// Convenience method to fetch GoComics A-Z page
|
|
||||||
func fetchAToZ() async throws -> String {
|
|
||||||
try await fetchHTML(from: "https://www.gocomics.com/comics/a-to-z")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
33
Sources/PuttyKit/Parsers/AToZ.swift
Normal file
33
Sources/PuttyKit/Parsers/AToZ.swift
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
import Foundation
|
||||||
|
import SwiftSoup
|
||||||
|
|
||||||
|
/// Turns the HTML of the comics A-to-Z listing page into `Feature` values.
///
/// A caseless enum used purely as a namespace for the selectors and the
/// static parsing functions.
public enum AToZParser {
    /// CSS selector of the section that wraps all feature cards.
    ///
    /// Public so callers can reference it, e.g. to wait for the section to be
    /// present before handing the page source to `parse(content:)`.
    // NOTE(review): the class name looks build-generated and may change
    // whenever the site is redeployed — confirm before relying on it.
    public static let baseSelector = "section.FilterComics_filterComics__9VGXB"
    /// CSS selector matching one anchor per feature card inside the base section.
    static let cardSelector = "a[data-analytics-type=\"feature_card\"]"
    /// CSS selector of the element inside a card that holds the title.
    static let titleSelector = "h3"
    /// CSS selector of the element inside a card that holds the byline.
    static let bylineSelector = "h4"
    /// CSS selector of the card's image element.
    static let imageSelector = "img"
    /// CSS selector of the element whose presence marks a card as updated.
    static let updatedSelector = "span"

    /// Parses the listing page and returns one `Feature` per feature card.
    ///
    /// - Parameter content: Raw HTML of the A-to-Z page.
    /// - Returns: The features in the order their cards are selected from the page.
    /// - Throws: `ParserError.missingElement` when the base section, or a
    ///   card's title or byline, cannot be found; any error thrown by SwiftSoup.
    public static func parse(content: String) throws -> [Feature] {
        let document = try SwiftSoup.parse(content)
        let section = try element(from: document, selector: baseSelector)
        let cards = try section.select(cardSelector)
        // `map`, not `compactMap`: the closure never produces nil — a card
        // either becomes a Feature or the whole parse throws.
        return try cards.map { card in
            let identifier = try card.attr("href")
            let title = try element(from: card, selector: titleSelector).text()
            let byline = try element(from: card, selector: bylineSelector).text()
            // The image is optional: no <img>, or a `src` that is not a valid
            // URL, results in a nil image URL rather than an error.
            let imageSource = try card.select(imageSelector).first()?.attr("src")
            let imageURL = imageSource.flatMap { URL(string: $0) }
            // Only the presence of the marker element matters, not its content.
            let isUpdated = try card.select(updatedSelector).first() != nil
            return Feature(identifier: identifier, title: title, byline: byline, imageURL: imageURL, isUpdated: isUpdated)
        }
    }

    /// Returns the first descendant of `document` matching `selector`.
    ///
    /// - Parameters:
    ///   - document: Element (or whole document) to search within.
    ///   - selector: CSS selector to match.
    /// - Returns: The first matching element.
    /// - Throws: `ParserError.missingElement(selector)` when nothing matches;
    ///   any error thrown by SwiftSoup while selecting.
    static func element(from document: SwiftSoup.Element, selector: String) throws -> SwiftSoup.Element {
        guard let match = try document.select(selector).first() else {
            throw ParserError.missingElement(selector)
        }
        return match
    }
}
|
||||||
9
Sources/PuttyKit/Parsers/Models/Feature.swift
Normal file
9
Sources/PuttyKit/Parsers/Models/Feature.swift
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
import Foundation
|
||||||
|
|
||||||
|
/// A single comic feature as listed on the A-to-Z page.
///
/// A plain immutable value type: hashable so it can be compared and stored in
/// sets, and sendable so it can cross concurrency boundaries.
public struct Feature: Hashable, Sendable {
    /// Identifier of the feature; the parser fills this from the card link's `href`.
    public let identifier: String
    /// Display title of the feature.
    public let title: String
    /// Byline text shown on the feature's card.
    public let byline: String
    /// URL of the card image, or `nil` when the card has no usable image source.
    public let imageURL: URL?
    /// Whether the feature's card carried the "updated" marker.
    public let isUpdated: Bool

    /// Creates a feature from its parts.
    ///
    /// Declared explicitly because the synthesized memberwise initializer of a
    /// public struct is internal and therefore unusable outside the module.
    public init(identifier: String, title: String, byline: String, imageURL: URL?, isUpdated: Bool) {
        self.identifier = identifier
        self.title = title
        self.byline = byline
        self.imageURL = imageURL
        self.isUpdated = isUpdated
    }
}
|
||||||
19
Sources/PuttyKit/Parsers/ParserError.swift
Normal file
19
Sources/PuttyKit/Parsers/ParserError.swift
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
import Foundation
|
||||||
|
|
||||||
|
/// The failures a parser can report.
public enum ParserError: Error, LocalizedError {
    /// A developer-facing failure carrying a free-form message.
    // NOTE(review): presumably meant for programmer mistakes — confirm intended use.
    case devError(String)
    /// Content could not be encoded as UTF-8.
    case encodingError
    /// No element matched the associated selector.
    case missingElement(String)

    /// Human-readable message for each failure.
    public var errorDescription: String? {
        switch self {
        case .devError(let message):
            return "devError: \(message)"
        case .encodingError:
            return "Error encoding content to UTF-8"
        case .missingElement(let selector):
            return "Failed to find element: \(selector)"
        }
    }
}
|
||||||
Loading…
x
Reference in New Issue
Block a user