(ns microformats.parser
(:require [net.cgrand.enlive-html :as html]
[clojure.zip :as z]
[clojure.core.reducers :as r]
[clojure.set :as set]
[clojure.string :as str]
[clojurewerkz.urly.core :as url]))
(defn- as-locs
[loc]
(if (and (vector? loc) (= (count loc) 2) (not (vector? (first loc))))
[loc]
loc))
(defn- zip-select
"Workaround https://github.com/cgrand/enlive/issues/117"
[loc-or-locs selector]
(html/zip-select (as-locs loc-or-locs) (list (@#'html/automaton selector))))
(defmacro anacond
[& clauses]
(when clauses
(list 'if-let ['% (first clauses)]
(if (next clauses)
(second clauses)
(throw (IllegalArgumentException.
"anacond requires an even number of forms")))
(cons 'anacond (next (next clauses))))))
(defn prefixed-by?
[prefix]
#(.startsWith % prefix))
(defn remove-mf-prefix
"Remove microformats prefixes from a class attribute"
[prefix]
#(apply str (drop (count prefix) %)))
(defn- split-ws-attribute
"Split a whitespace-separated attribute."
[class]
(str/split class #"\s+"))
(defn classes-to-props
"Convert class list to list of microformat property keywords"
[prefix]
(comp (r/map keyword)
(r/map (remove-mf-prefix prefix))
(r/filter (prefixed-by? prefix))))
(defn element-to-classes
"Get list of classes from an element"
[el] (some-> el
:attrs
:class
split-ws-attribute))
(defn element-to-rels
"Get list of rels from an element"
[el] (-> el
:attrs
:rel
split-ws-attribute))
(defn- node-to-html
"Turn a node into a list of HTML strings"
[el]
(map #(if (string? %)
%
(apply str (persistent! (html/emit-tag % (transient []))))) el))
(defn- node-to-text
"Turn a node into a text string"
[content]
(->> content
html/texts
(apply str)
(#(str/replace % #"\s+" " "))
str/trim))
(defn get-base-url
"Find the base-url of a document."
[document]
(or (some-> document
meta
:base)
(-> document
(html/select [:head :> [:base (html/attr? :href)]])
first
:attrs
:href)
""))
(defn with-base-url
"Attach the base URL of a document as metadata"
([document]
(with-base-url (get-base-url document) document))
([base-url document]
(if (instance? clojure.lang.IObj document)
(vary-meta document assoc :base base-url)
document)))
(defn normalise-url
"Normalise a URL"
[root url]
(url/resolve (get-base-url root) url))
(defn get-value-title-class
"Get the value-title class of elements"
[elements]
(str/join " " (into [] ((comp (r/map :title)
(r/map :attrs))
elements))))
(defn get-value-class
"Get the value class of elements"
[elements]
(str/join " " (into [] ((comp (r/map (partial apply str))
(r/map node-to-text)
(r/map :content))
elements))))
(defn find-value-class
"Find and get the value class of elements"
[el]
(anacond
(not-empty (html/select el [html/root :> :.value-title]))
(get-value-title-class %)
(not-empty (html/select el [html/root :> :.value ]))
(get-value-class %)))
(declare parse-h)
(defn remove-property-classes
[element]
(into {} (html/transform (list element) [html/root]
(apply html/remove-class (filter (prefixed-by? "p-")
(element-to-classes element))))))
(defn get-child-mf-properties
[loc]
(assoc (parse-h (z/edit loc remove-property-classes)) :value (-> loc z/node :content node-to-text)))
(defn- find-child-mf
"Find child property microformats of an element."
[loc]
(let [element (z/node loc)]
(when (-> element :attrs :class (.indexOf "h-") (>= 0))
(get-child-mf-properties loc))))
(defn get-p-value
"Get the p-x property value of an element"
[loc]
(let [el (z/node loc)]
(or (find-child-mf loc)
(str/trim (or (find-value-class el)
(case (:tag el)
:img (-> el :attrs :alt)
:area (-> el :attrs :alt)
:abbr (-> el :attrs :title)
:data (-> el :attrs :value)
:input (-> el :attrs :value)
nil)
(node-to-text (:content el))
"")))))
(defn get-u-value
"Get the u-x property value of an element"
[loc]
(let [el (z/node loc)]
(str/trim (or (find-value-class el)
(case (:tag el)
:a (normalise-url (z/root loc) (-> el :attrs :href))
:area (normalise-url (z/root loc) (-> el :attrs :href))
:img (normalise-url (z/root loc) (-> el :attrs :src))
:object (normalise-url (z/root loc) (-> el :attrs :data))
(get-p-value loc))
(node-to-text (:content el))
""))))
(defn get-dt-value
"Get the dt-x property value of an element"
[loc]
(let [el (z/node loc)]
(str/trim (or (find-value-class el)
(case (:tag el)
:time (-> el :attrs :datetime)
:ins (-> el :attrs :datetime)
:del (-> el :attrs :datetime)
:abbr (-> el :attrs :title)
:data (-> el :attrs :value)
:input (-> el :attrs :value)
nil)
(node-to-text (:content el))
""))))
(defn get-e-value
"Get the e-x propery value of an element"
[loc]
(let [el (z/node loc)
content (:content el)]
(list {:html (apply str (node-to-html content))
:value (apply str (node-to-text content))})))
(declare walk-children)
(defn gen-property-parser
"Create a property parser"
[f]
(fn [loc]
(apply (partial merge-with concat)
(f loc)
(walk-children loc))))
(def parse-p
"Parse p-* classes within HTML element."
(gen-property-parser
(fn [loc]
(->> loc
z/node
element-to-classes
((classes-to-props "p-"))
(r/map #(hash-map % (list (get-p-value loc))))
(into {})))))
(def parse-u
"Parse u-* classes within HTML element"
(gen-property-parser
(fn [loc]
(->> loc
z/node
element-to-classes
((classes-to-props "u-"))
(r/map #(hash-map % (list (get-u-value loc))))
(into {})))))
(def parse-dt
"Parse dt-* classes within HTML element"
(gen-property-parser
(fn [loc]
(->> loc
z/node
element-to-classes
((classes-to-props "dt-"))
(r/map #(hash-map % (list (get-dt-value loc))))
(into {})))))
(def parse-e
"Parse e-* classes within HTML element"
(gen-property-parser
(fn [loc]
(->> loc
z/node
element-to-classes
((classes-to-props "e-"))
(r/map #(hash-map % (get-e-value loc)))
(into {})))))
(defn- get-mf-names
"Get the microformat names from an element"
[element]
(->> element
element-to-classes
(filter (prefixed-by? "h-"))))
(defn- parse-implied-name
"Get the implied name of an entity"
[loc]
(let [element (z/node loc)]
(case (:tag element)
:abbr (-> element :attrs :title)
:img (-> element :attrs :alt)
(anacond
(first (html/select element [html/root :> [:img html/only-child]]))
(-> % :attrs :alt)
(first (html/select element [html/root :> [:abbr html/only-child (html/attr? :title)]]))
(-> % :attrs :title)
(first (html/select element [html/root :> html/only-child :> [:img html/only-child]]))
(-> % :attrs :alt)
(first (html/select element [html/root :> html/only-child :> [:abbr html/only-child (html/attr? :title)]]))
(-> % :attrs :title)
true (node-to-text (:content element))))))
(defn- parse-implied-url
[loc]
(let [element (z/node loc)]
(some->>
(case (:tag element)
:a (-> element :attrs :href)
(if-let [% (first (html/select element [html/root :> [:a (html/attr? :href) html/only-of-type (html/but-node (html/attr-contains :class "h-"))]]))]
(-> % :attrs :href)))
(normalise-url (z/root loc)))))
(defn- parse-implied-photo
[loc]
(let [element (z/node loc)]
(some->>
(case (:tag element)
:img (-> element :attrs :src)
:object (-> element :attrs :data)
(anacond
(first (html/select element [html/root :> [:img (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
(-> % :attrs :src)
(first (html/select element [html/root :> [:object (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
(-> % :attrs :data)
(first (html/select element [html/root :> html/only-child :> [:img (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
(-> % :attrs :src)
(first (html/select element [html/root :> html/only-child :> [:object (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
(-> % :attrs :data)))
(normalise-url (z/root loc)))))
(def empty-ish
#(not (str/blank? (first (second %)))))
(defn parse-implied
"Parse implied properties of a HTML element"
[loc]
(into {} (filter empty-ish
{:name (list (parse-implied-name loc))
:url (list (parse-implied-url loc))
:photo (list (parse-implied-photo loc))})))
(defn parse-h
"Parse h-* classes within a HTML element."
[loc]
(hash-map :type (get-mf-names (z/node loc))
:properties (merge (parse-implied loc)
(apply merge-with concat (walk-children loc)))))
(defn parse-mf
"Parse microformats within a HTML element."
[loc mf-type]
(case mf-type
"h" (parse-h loc)
"p" (parse-p loc)
"u" (parse-u loc)
"dt" (parse-dt loc)
"e" (parse-e loc)))
(defn has-child?
[types] (set/subset? #{"p" "h"} types))
(defn single-pass-child
"Ensure a child microformat of a property is only parsed as a child"
[types]
(if (has-child? types)
(remove #(= "h" %) types)
types))
(defn walk
"Walk HTML element tree for microformat properties."
[loc]
(when (and (not (z/end? loc))
(not (contains? #{:br :hr} (-> loc z/node :tag))))
(if-let [types (some->> loc z/node :attrs :class (re-seq #"(?:^|\s)(h|p|u|dt|e)-\w+") (map second) set)]
(map (partial parse-mf loc) (single-pass-child types))
(recur (z/next loc)))))
(defn map-walk
[root]
(comp (r/map (partial apply merge))
(r/filter identity)
(r/map walk)
(r/map z/xml-zip)
(r/map (partial with-base-url (get-base-url root)))))
(defn walk-children
"Walk through child elements of loc"
[loc]
(some->> loc z/children ((map-walk (z/root loc))) (into [])))
(defn parse-rel
"Parse rel attributes of an HTML link element"
[loc]
(->> loc
z/node
element-to-rels
(map keyword)
(map #(hash-map % [(normalise-url (z/root loc) (-> loc z/node :attrs :href))]))
(into {})))
(defn select-rels
"Select linking HTML elements with rel attributes"
[html] (zip-select html [[#{:a :link} (html/attr? :rel)]]))
(defn parse-rels
"Parse rel attibutes of a set of HTML link elements"
[locs]
(or (apply merge-with into (map parse-rel (select-rels locs))) {}))
(defprotocol node
(to-node [html]))
(extend-protocol node
String
(to-node [html]
(some->> html str/trim html/html-snippet)))
(extend-protocol node
clojure.lang.LazySeq
(to-node [html]
html))
(defn parse
"Parse a HTML string with microformats"
([html]
(let [document (some->> html to-node with-base-url)]
(parse document (get-base-url document))))
([html base-url]
(let [document (some->> html to-node first (with-base-url base-url) z/xml-zip)]
{:items (some-> document walk) :rels (parse-rels document)})))
src/microformats/parser.clj (view raw)