SyndicationDomination 0.0
An RSS/Atom parser, because there's nothing else out there.
Loading...
Searching...
No Matches
utils.hpp
Go to the documentation of this file.
1#pragma once
2
#include <algorithm>
#include <cctype>
#include <chrono>
#include <sstream>
#include <string>
#include <vector>

#include <fmt/chrono.h>
#include <pugixml.hpp>

#include "extraction_param.hpp"
12
13namespace SynDomUtils {
14
15struct xml_string_writer: xml_writer {
16 std::string result;
17 virtual void write(const void* data, size_t size) {
18 result.append(static_cast<const char*>(data), size);
19 }
20};
21
/**
 * Element names under which a link node may appear. extract_link() tries
 * them in the order listed here.
 */
const std::string ATOM_LINK_TAGS[] = {
    "link",
    "atom:link",
    "atom10:link",
};
25
/**
 * Predicate used by the trim helpers: tells whether a character must be kept.
 *
 * NOTE(review): identifiers containing a double underscore are reserved for
 * the implementation in C++; the name is kept because other code refers to it.
 *
 * @param ch Character to test, taken as unsigned char as std::isspace expects
 * @return true if `ch` is not whitespace according to std::isspace
 */
inline bool __trim_filter(unsigned char ch) {
    const bool is_whitespace = std::isspace(ch) != 0;
    return !is_whitespace;
}
29
31inline void ltrim(std::string &s) {
32 s.erase(s.begin(), std::find_if(s.begin(), s.end(), __trim_filter));
33}
34
36inline void rtrim(std::string &s) {
37 s.erase(
38 std::find_if(s.rbegin(), s.rend(), __trim_filter).base(), s.end()
39 );
40}
41
43inline void trim(std::string &s) { ltrim(s); rtrim(s); }
44
/**
 * Converts a string to lowercase, in place.
 *
 * Each byte is converted on its own with std::tolower, so only characters
 * the current C locale knows how to lowercase are changed.
 *
 * @param s String to convert; it is modified directly
 */
inline void lower(std::string &s) {
    std::transform(
        s.begin(), s.end(), s.begin(),
        // std::tolower has undefined behaviour for negative values other
        // than EOF, so each char is passed as unsigned char (this matters
        // for non-ASCII bytes where plain char is signed).
        [](unsigned char ch) { return static_cast<char>(std::tolower(ch)); }
    );
}
49
/**
 * Checks whether a string starts with a given prefix.
 *
 * Both arguments are taken by const reference so that no copy is made for
 * the check; string literals still convert implicitly at the call site.
 *
 * @param s String to inspect
 * @param prefix Prefix to look for; an empty prefix matches every string
 * @return true if `s` begins with `prefix`
 */
inline bool str_has_prefix(const std::string &s, const std::string &prefix) {
    // rfind starting at position 0 can only ever match at position 0, so
    // this tests the prefix without scanning the rest of the string.
    return s.rfind(prefix, 0) == 0;
}
59
/**
 * Splits a string into pieces on a single-character delimiter.
 *
 * Pieces are read with std::getline, which means that an empty input yields
 * an empty vector, a leading delimiter yields an empty first piece, and a
 * trailing delimiter does not produce an extra empty piece at the end.
 *
 * @param s String to split
 * @param delim Delimiter character; it is not included in the pieces
 * @return The pieces, in the order they appear in `s`
 */
inline std::vector<std::string> split(const std::string &s, char delim) {
    std::vector<std::string> result;
    std::istringstream ss(s);
    std::string item;
    while (std::getline(ss, item, delim)) {
        result.push_back(item);
    }
    return result;
}
76
80inline std::string current_time() {
81 auto now = std::chrono::system_clock::now();
82 std::string res = fmt::format("{:%Y-%m-%dT%H:%M:%S}", now);
83 return res;
84}
85
87inline bool is_url(std::string s) {
88 return !s.empty() && (
89 str_has_prefix(s, "https://") ||
90 str_has_prefix(s, "http://")
91 );
92}
93
102inline std::string extract_from_node(
103 pugi::xml_node node, std::vector<ExtractionParam> params
104) {
105 std::string res = "";
106 for (auto param: params) {
107 auto child = node;
108 for (auto tag: param.tags) {
109 if (!child) break;
110 child = child.child(tag.c_str());
111 }
112 if (!child) continue;
113
114 if (param.type == ExtractionParam::ParamType::CHILD) {
115 res = child.text().as_string();
116 // if the child content is not CDATA or text, then it's probably
117 // xhtml, so we grab that with the writer
118 if (
119 child && res.empty() &&
120 child.first_child().type() != node_cdata &&
121 child.first_child()
122 ) {
123 xml_node child_cpy = child;
124 child_cpy.set_name("article");
125 xml_string_writer writer;
126 child_cpy.print(writer, "");
127 if (writer.result != "<article />") {
128 res = writer.result;
129 }
130 }
131 }
132 else {
133 res = child.attribute(param.attribute.c_str()).value();
134 }
135
136 if (!res.empty()) {
137 trim(res);
138 return res;
139 }
140 }
141 return "";
142}
143
155inline std::string extract_link(
156 pugi::xml_node node,
157 std::vector<std::string> rels,
158 std::vector<std::string> types,
159 bool opt_rel=false, bool opt_type=false
160) {
161 std::string res = "";
162 std::string attr_rel, attr_type;
163 for (auto link_tag: ATOM_LINK_TAGS) {
164 pugi::xml_node link_node = node.child(link_tag.c_str());
165 while(link_node) {
166 attr_rel = link_node.attribute("rel").value();
167 attr_type = link_node.attribute("type").value();
168 if (
169 ((opt_rel && attr_rel.empty()) || std::find(
170 rels.begin(), rels.end(), attr_rel
171 ) != rels.end()) &&
172 ((opt_type && attr_type.empty()) || std::find(
173 types.begin(), types.end(), attr_type
174 ) != types.end())
175 ) {
176 res = link_node.attribute("href").value();
177 if (!res.empty()) return res;
178 }
179 link_node = link_node.next_sibling(link_tag.c_str());
180 }
181 }
182
183 return "";
184}
185
186};
Definition utils.hpp:13
std::string extract_from_node(pugi::xml_node node, std::vector< ExtractionParam > params)
Searches for a certain value, starting from a provided pugi::xml_node, using a vector of ExtractionPa...
Definition utils.hpp:102
void lower(std::string &s)
Changes the string case to all lowercase, in place.
Definition utils.hpp:46
bool is_url(std::string s)
Rudimentarily checks if the provided string is a URL.
Definition utils.hpp:87
bool __trim_filter(unsigned char ch)
Definition utils.hpp:26
const std::string ATOM_LINK_TAGS[]
Definition utils.hpp:22
std::string current_time()
Returns the current date and time in ISO 8601 format (YYYY-MM-DDTHH:MM:SS); the format string does not append a UTC offset.
Definition utils.hpp:80
void trim(std::string &s)
Trims a string from both ends, in place.
Definition utils.hpp:43
std::string extract_link(pugi::xml_node node, std::vector< std::string > rels, std::vector< std::string > types, bool opt_rel=false, bool opt_type=false)
Searches for a certain value pertaining to a <link /> node.
Definition utils.hpp:155
void rtrim(std::string &s)
Trims a string from the end, in place.
Definition utils.hpp:36
std::vector< std::string > split(std::string s, char delim)
Splits a string into an std::vector of strings, using a single character delimiter.
Definition utils.hpp:67
bool str_has_prefix(std::string s, std::string prefix)
Checks if the string s has the prefix prefix.
Definition utils.hpp:56
void ltrim(std::string &s)
Trims a string from the start, in place.
Definition utils.hpp:31
@ CHILD
Definition extraction_param.hpp:18
Definition utils.hpp:15
virtual void write(const void *data, size_t size)
Definition utils.hpp:17
std::string result
Definition utils.hpp:16