all repos — archive/microformats @ 20fa6dcbf6cdb737155e1928455133ebbc0654bf

Incomplete Clojure microformats library

src/microformats/parser.clj (view raw)

(ns microformats.parser
  (:require [net.cgrand.enlive-html :as html]
            [clojure.zip :as z]
            [clojure.core.reducers :as r]
            [clojure.set :as set]
            [clojure.string :as str]
            [clojurewerkz.urly.core :as url]))

(defn- as-locs
  [loc]
  (if (and (vector? loc) (= (count loc) 2) (not (vector? (first loc))))
    [loc]
    loc))

(defn- zip-select
  "Workaround https://github.com/cgrand/enlive/issues/117"
  [loc-or-locs selector]
  (html/zip-select (as-locs loc-or-locs) (list (@#'html/automaton selector))))

(defmacro anacond
  [& clauses]
  (when clauses
    (list 'if-let ['% (first clauses)]
          (if (next clauses)
            (second clauses)
            (throw (IllegalArgumentException.
                    "anacond requires an even number of forms")))
          (cons 'anacond (next (next clauses))))))

(defn prefixed-by?
  [prefix]
  #(.startsWith % prefix))

(defn remove-mf-prefix
  "Remove microformats prefixes from a class attribute"
  [prefix]
  #(apply str (drop (count prefix) %)))

(defn- split-ws-attribute
  "Split a whitespace-separated attribute."
  [class]
  (str/split class #"\s+"))

(defn classes-to-props
  "Convert class list to list of microformat property keywords"
  [prefix]
  (comp (r/map keyword)
        (r/map (remove-mf-prefix prefix))
        (r/filter (prefixed-by? prefix))))

(defn element-to-classes
  "Get list of classes from an element"
  [el] (some-> el
               :attrs
               :class
               split-ws-attribute))

(defn element-to-rels
  "Get list of rels from an element"
  [el] (-> el
           :attrs
           :rel
           split-ws-attribute))

(defn- node-to-html
  "Turn a node into a list of HTML strings"
  [el]
  (map #(if (string? %)
          %
          (apply str (persistent! (html/emit-tag % (transient []))))) el))

(defn- node-to-text
  "Turn a node into a text string"
  [content]
  (->> content
       html/texts
       (apply str)
       (#(str/replace % #"\s+" " "))
       str/trim))

(defn get-base-url
  "Find the base-url of a document."
  [document]
  (or (some-> document
              meta
              :base)
      (-> document
          (html/select [:head :> [:base (html/attr? :href)]])
          first
          :attrs
          :href)
      ""))

(defn with-base-url
  "Attach the base URL of a document as metadata"
  ([document]
     (with-base-url (get-base-url document) document))
  ([base-url document]
     (if (instance? clojure.lang.IObj document)
       (vary-meta document assoc :base base-url)
       document)))

(defn normalise-url
  "Normalise a URL"
  [root url]
  (url/resolve (get-base-url root) url))

(defn get-value-title-class
  "Get the value-title class of elements"
  [elements]
  (str/join " " (into [] ((comp (r/map :title)
                                (r/map :attrs))
                          elements))))

(defn get-value-class
  "Get the value class of elements"
  [elements]
  (str/join " " (into [] ((comp (r/map (partial apply str))
                                (r/map node-to-text)
                                (r/map :content))
                          elements))))

(defn find-value-class
  "Find and get the value class of elements"
  [el]
  (anacond
   (not-empty (html/select el [html/root :> :.value-title]))
   (get-value-title-class %)
   (not-empty (html/select el [html/root :> :.value ]))
   (get-value-class %)))

(declare parse-h)

(defn remove-property-classes
  [element]
  (into {} (html/transform (list element) [html/root]
                           (apply html/remove-class (filter (prefixed-by? "p-")
                                                            (element-to-classes element))))))
(defn get-child-mf-properties
  [loc]
  (assoc (parse-h (z/edit loc remove-property-classes)) :value (-> loc z/node :content node-to-text)))

(defn- find-child-mf
  "Find child property microformats of an element."
  [loc]
  (let [element (z/node loc)]
    (when (-> element :attrs :class (.indexOf "h-") (>= 0))
     (get-child-mf-properties loc))))

(defn get-p-value
  "Get the p-x property value of an element"
  [loc]
  (let [el (z/node loc)]
    (or (find-child-mf loc)
        (str/trim (or (find-value-class el)
                      (case (:tag el)
                        :img (-> el :attrs :alt)
                        :area (-> el :attrs :alt)
                        :abbr (-> el :attrs :title)
                        :data (-> el :attrs :value)
                        :input (-> el :attrs :value)
                        nil)
                      (node-to-text (:content el))
                      "")))))

(defn get-u-value
  "Get the u-x property value of an element"
  [loc]
  (let [el (z/node loc)]
    (str/trim (or (find-value-class el)
                  (case (:tag el)
                    :a (normalise-url (z/root loc) (-> el :attrs :href))
                    :area (normalise-url (z/root loc) (-> el :attrs :href))
                    :img (normalise-url (z/root loc) (-> el :attrs :src))
                    :object (normalise-url (z/root loc) (-> el :attrs :data))
                    (get-p-value loc))
                  (node-to-text (:content el))
                  ""))))

(defn get-dt-value
  "Get the dt-x property value of an element"
  [loc]
  (let [el (z/node loc)]
    (str/trim (or (find-value-class el)
                  (case (:tag el)
                    :time (-> el :attrs :datetime)
                    :ins  (-> el :attrs :datetime)
                    :del  (-> el :attrs :datetime)
                    :abbr (-> el :attrs :title)
                    :data (-> el :attrs :value)
                    :input (-> el :attrs :value)
                    nil)
                  (node-to-text (:content el))
                  ""))))

(defn get-e-value
  "Get the e-x propery value of an element"
  [loc]
  (let [el (z/node loc)
        content (:content el)]
    (list {:html (apply str (node-to-html content))
           :value (apply str (node-to-text content))})))

(declare walk-children)

(defn gen-property-parser
  "Create a property parser"
  [f]
  (fn [loc]
    (apply (partial merge-with concat)
           (f loc)
           (walk-children loc))))

(def parse-p
  "Parse p-* classes within HTML element."
  (gen-property-parser
   (fn [loc]
     (->> loc
          z/node
          element-to-classes
          ((classes-to-props "p-"))
          (r/map #(hash-map % (list (get-p-value loc))))
          (into {})))))

(def parse-u
  "Parse u-* classes within HTML element"
  (gen-property-parser
   (fn [loc]
     (->> loc
          z/node
          element-to-classes
          ((classes-to-props "u-"))
          (r/map #(hash-map % (list (get-u-value loc))))
          (into {})))))

(def parse-dt
  "Parse dt-* classes within HTML element"
  (gen-property-parser
   (fn [loc]
     (->> loc
          z/node
          element-to-classes
          ((classes-to-props "dt-"))
          (r/map #(hash-map % (list (get-dt-value loc))))
          (into {})))))

(def parse-e
  "Parse e-* classes within HTML element"
  (gen-property-parser
   (fn [loc]
     (->> loc
          z/node
          element-to-classes
          ((classes-to-props "e-"))
          (r/map #(hash-map % (get-e-value loc)))
          (into {})))))

(defn- get-mf-names
  "Get the microformat names from an element"
  [element]
  (->> element
       element-to-classes
       (filter (prefixed-by? "h-"))))

(defn- parse-implied-name
  "Get the implied name of an entity"
  [loc]
  (let [element (z/node loc)]
    (case (:tag element)
      :abbr (-> element :attrs :title)
      :img (-> element :attrs :alt)
      (anacond
       (first (html/select element [html/root :> [:img html/only-child]]))
       (-> % :attrs :alt)
       (first (html/select element [html/root :> [:abbr html/only-child (html/attr? :title)]]))
       (-> % :attrs :title)
       (first (html/select element [html/root :> html/only-child :> [:img html/only-child]]))
       (-> % :attrs :alt)
       (first (html/select element [html/root :> html/only-child :> [:abbr html/only-child (html/attr? :title)]]))
       (-> % :attrs :title)
       true (node-to-text (:content element))))))

(defn- parse-implied-url
  [loc]
  (let [element (z/node loc)]
    (some->>
     (case (:tag element)
       :a (-> element :attrs :href)
       (if-let [% (first (html/select element [html/root :> [:a (html/attr? :href) html/only-of-type (html/but-node (html/attr-contains :class "h-"))]]))]
         (-> % :attrs :href)))
     (normalise-url (z/root loc)))))

(defn- parse-implied-photo
  [loc]
  (let [element (z/node loc)]
    (some->>
     (case (:tag element)
       :img (-> element :attrs :src)
       :object (-> element :attrs :data)
       (anacond
        (first (html/select element [html/root :> [:img (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
        (-> % :attrs :src)
        (first (html/select element [html/root :> [:object (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
        (-> % :attrs :data)
        (first (html/select element [html/root :> html/only-child :> [:img (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
        (-> % :attrs :src)
        (first (html/select element [html/root :> html/only-child :> [:object (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
        (-> % :attrs :data)))
     (normalise-url (z/root loc)))))

(def empty-ish
  #(not (str/blank? (first (second %)))))

(defn parse-implied
  "Parse implied properties of a HTML element"
  [loc]
  (into {} (filter empty-ish
                   {:name (list (parse-implied-name loc))
                    :url (list (parse-implied-url loc))
                    :photo (list (parse-implied-photo loc))})))

(defn parse-h
  "Parse h-* classes within a HTML element."
  [loc]
  (hash-map :type (get-mf-names (z/node loc))
            :properties (merge (parse-implied loc)
                               (apply merge-with concat (walk-children loc)))))

(defn parse-mf
  "Parse microformats within a HTML element."
  [loc mf-type]
  (case mf-type
    "h" (parse-h loc)
    "p" (parse-p loc)
    "u" (parse-u loc)
    "dt" (parse-dt loc)
    "e" (parse-e loc)))

(defn has-child?
  [types] (set/subset? #{"p" "h"} types))

(defn single-pass-child
  "Ensure a child microformat of a property is only parsed as a child"
  [types]
  (if (has-child? types)
    (remove #(= "h" %) types)
    types))

(defn walk
  "Walk HTML element tree for microformat properties."
  [loc]
  (when (and (not (z/end? loc))
             (not (contains? #{:br :hr} (-> loc z/node :tag))))
    (if-let [types (some->> loc z/node :attrs :class (re-seq #"(?:^|\s)(h|p|u|dt|e)-\w+") (map second) set)]
      (map (partial parse-mf loc) (single-pass-child types))
      (recur (z/next loc)))))

(defn map-walk
  [root]
  (comp (r/map (partial apply merge))
        (r/filter identity)
        (r/map walk)
        (r/map z/xml-zip)
        (r/map (partial with-base-url (get-base-url root)))))

(defn walk-children
  "Walk through child elements of loc"
  [loc]
  (some->> loc z/children ((map-walk (z/root loc))) (into [])))

(defn parse-rel
  "Parse rel attributes of an HTML link element"
  [loc]
  (->> loc
       z/node
       element-to-rels
       (map keyword)
       (map #(hash-map % [(normalise-url (z/root loc) (-> loc z/node :attrs :href))]))
       (into {})))

(defn select-rels
  "Select linking HTML elements with rel attributes"
  [html] (zip-select html [[#{:a :link} (html/attr? :rel)]]))

(defn parse-rels
  "Parse rel attibutes of a set of HTML link elements"
  [locs]
  (or (apply merge-with into (map parse-rel (select-rels locs))) {}))

(defprotocol node
  (to-node [html]))

(extend-protocol node
  String
  (to-node [html]
    (some->> html str/trim html/html-snippet)))

(extend-protocol node
  clojure.lang.LazySeq
  (to-node [html]
    html))

(defn parse
  "Parse a HTML string with microformats"
  ([html]
     (let [document (some->> html to-node with-base-url)]
       (parse document (get-base-url document))))
  ([html base-url]
     (let [document (some->> html to-node first (with-base-url base-url) z/xml-zip)]
       {:items (some-> document walk) :rels (parse-rels document)})))