~ruther/guix-local

0f6b9e9828dfc269bfc4eade771efed1753e8c62 — Ricardo Wurmus 10 years ago b6a2227
import: cran: Parse DESCRIPTION instead of HTML.

* guix/import/cran.scm (description->alist, listify,
  beautify-description, description->package): New procedures.
(table-datum, downloads->url, nodes->text, cran-sxml->sexp): Remove
proceduces.
(latest-release): Use parsed DESCRIPTION instead of SXML.
* tests/cran.scm: Rewrite to match importer.
2 files changed, 209 insertions(+), 245 deletions(-)

M guix/import/cran.scm
M tests/cran.scm
M guix/import/cran.scm => guix/import/cran.scm +130 -135
@@ 20,26 20,26 @@
(define-module (guix import cran)
  #:use-module (ice-9 match)
  #:use-module (ice-9 regex)
  #:use-module ((ice-9 rdelim) #:select (read-string))
  #:use-module (srfi srfi-1)
  #:use-module (srfi srfi-26)
  #:use-module (sxml simple)
  #:use-module (sxml match)
  #:use-module (sxml xpath)
  #:use-module (guix http-client)
  #:use-module (guix hash)
  #:use-module (guix store)
  #:use-module (guix base32)
  #:use-module ((guix download) #:select (download-to-store))
  #:use-module (guix import utils)
  #:use-module ((guix build-system r) #:select (cran-uri))
  #:use-module (guix upstream)
  #:use-module (guix packages)
  #:use-module (gnu packages)
  #:export (cran->guix-package
            %cran-updater))

;;; Commentary:
;;;
;;; Generate a package declaration template for the latest version of an R
;;; package on CRAN, using the HTML description downloaded from
;;; package on CRAN, using the DESCRIPTION file downloaded from
;;; cran.r-project.org.
;;;
;;; Code:


@@ 67,6 67,31 @@
   ((lst ...) `(list ,@(map string->license lst)))
   (_ #f)))


(define (description->alist description)
  "Convert a DESCRIPTION string into an alist."
  (let ((lines (string-split description #\newline))
        (parse (lambda (line acc)
                 (if (string-null? line) acc
                     ;; Keys usually start with a capital letter and end with
                     ;; ":".  There are some exceptions, unfortunately (such
                     ;; as "biocViews").  There are no blanks in a key.
                     (if (string-match "^[A-Za-z][^ :]+:( |\n|$)" line)
                         ;; New key/value pair
                         (let* ((pos   (string-index line #\:))
                                (key   (string-take line pos))
                                (value (string-drop line (+ 1 pos))))
                           (cons (cons key
                                       (string-trim-both value))
                                 acc))
                         ;; This is a continuation of the previous pair
                         (match-let ((((key . value) . rest) acc))
                           (cons (cons key (string-join
                                            (list value
                                                  (string-trim-both line))))
                                 rest)))))))
    (fold parse '() lines)))

(define (format-inputs names)
  "Generate a sorted list of package inputs from a list of package NAMES."
  (map (lambda (name)


@@ 82,125 107,94 @@ package definition."
    ((package-inputs ...)
     `((,type (,'quasiquote ,(format-inputs package-inputs)))))))

(define (table-datum tree label)
  "Extract the datum node following a LABEL in the sxml table TREE.  Only the
first cell of a table row is considered a label cell."
  ((node-pos 1)
   ((sxpath `(xhtml:tr
              (xhtml:td 1)        ; only first cell can contain label
              (equal? ,label)
              ,(node-parent tree) ; go up to label cell
              ,(node-parent tree) ; go up to matching row
              (xhtml:td 2)))      ; select second cell
    tree)))

(define %cran-url "http://cran.r-project.org/web/packages/")

(define (cran-fetch name)
  "Return an sxml representation of the CRAN page for the R package NAME,
or #f on failure.  NAME is case-sensitive."
  "Return an alist of the contents of the DESCRIPTION file for the R package
NAME, or #f on failure.  NAME is case-sensitive."
  ;; This API always returns the latest release of the module.
  (let ((cran-url (string-append %cran-url name "/")))
    (false-if-exception
     (xml->sxml (http-fetch cran-url)
                #:trim-whitespace? #t
                #:namespaces '((xhtml . "http://www.w3.org/1999/xhtml"))
                #:default-entity-handler
                (lambda (port name)
                  (case name
                    ((nbsp) " ")
                    ((ge) ">=")
                    ((gt) ">")
                    ((lt) "<")
                    (else
                     (format (current-warning-port)
                             "~a:~a:~a: undefined entitity: ~a\n"
                             cran-url (port-line port) (port-column port)
                             name)
                     (symbol->string name))))))))

(define (downloads->url downloads)
  "Extract from DOWNLOADS, the downloads item of the CRAN sxml tree, the
download URL."
  (string-append "mirror://cran/"
                 ;; Remove double dots, because we want an
                 ;; absolute path.
                 (regexp-substitute/global
                  #f "\\.\\./"
                  (string-join ((sxpath '((xhtml:a 1) @ href *text*))
                                (table-datum downloads " Package source: ")))
                  'pre 'post)))

(define (nodes->text nodeset)
  "Return the concatenation of the text nodes among NODESET."
  (string-join ((sxpath '(// *text*)) nodeset) " "))

(define (cran-sxml->sexp sxml)
  "Return the `package' s-expression for a CRAN package from the SXML
representation of the package page."
  (let ((url (string-append %cran-url name "/DESCRIPTION")))
    (description->alist (read-string (http-fetch url)))))

(define (listify meta field)
  "Look up FIELD in the alist META.  If FIELD contains a comma-separated
string, turn it into a list and strip off parenthetic expressions.  Return the
empty list when the FIELD cannot be found."
  (let ((value (assoc-ref meta field)))
    (if (not value)
        '()
        ;; Strip off parentheses
        (let ((items (string-split (regexp-substitute/global
                                    #f "( *\\([^\\)]+\\)) *"
                                    value 'pre 'post)
                                   #\,)))
          ;; When there is whitespace inside of items it is probably because
          ;; this was not an actual list to begin with.
          (remove (cut string-any char-set:whitespace <>)
                  (map string-trim-both items))))))

(define (beautify-description description)
  "Improve the package DESCRIPTION by turning a beginning sentence fragment
into a proper sentence and by using two spaces between sentences."
  (let ((cleaned (if (string-prefix? "A " description)
                     (string-append "This package provides a"
                                    (substring description 1))
                     description)))
    ;; Use double spacing between sentences
    (regexp-substitute/global #f "\\. \\b"
                              cleaned 'pre ".  " 'post)))

(define (description->package meta)
  "Return the `package' s-expression for a CRAN package from the alist META,
which was derived from the R package's DESCRIPTION file."
  (define (guix-name name)
    (if (string-prefix? "r-" name)
        (string-downcase name)
        (string-append "r-" (string-downcase name))))

  (sxml-match-let*
   (((*TOP* (xhtml:html
             ,head
             (xhtml:body
              (xhtml:h2 ,name-and-synopsis)
              (xhtml:p ,description)
              ,summary
              (xhtml:h4 "Downloads:") ,downloads
              . ,rest)))
     sxml))
   (let* ((name       (match:prefix (string-match ": " name-and-synopsis)))
          (synopsis   (match:suffix (string-match ": " name-and-synopsis)))
          (version    (nodes->text (table-datum summary "Version:")))
          (license    ((compose string->license nodes->text)
                       (table-datum summary "License:")))
          (home-page  (nodes->text ((sxpath '((xhtml:a 1)))
                                    (table-datum summary "URL:"))))
          (source-url (downloads->url downloads))
          (tarball    (with-store store (download-to-store store source-url)))
          (sysdepends (map match:substring
                           (list-matches
                            "[^ ]+"
                            ;; Strip off comma and parenthetical
                            ;; expressions.
                            (regexp-substitute/global
                             #f "(,|\\([^\\)]+\\))"
                             (nodes->text (table-datum summary
                                                       "SystemRequirements:"))
                             'pre 'post))))
          (imports    (map guix-name
                           ((sxpath '(// xhtml:a *text*))
                            (table-datum summary "Imports:")))))
     `(package
        (name ,(guix-name name))
        (version ,version)
        (source (origin
                  (method url-fetch)
                  (uri (cran-uri ,name version))
                  (sha256
                   (base32
                    ,(bytevector->nix-base32-string (file-sha256 tarball))))))
        (build-system r-build-system)
        ,@(maybe-inputs sysdepends)
        ,@(maybe-inputs imports 'propagated-inputs)
        (home-page ,(if (string-null? home-page)
                        (string-append %cran-url name)
                        home-page))
        (synopsis ,synopsis)
        ;; Use double spacing
        (description ,(regexp-substitute/global #f "\\. \\b" description
                                                'pre ".  " 'post))
        (license ,license)))))
  (let* ((name       (assoc-ref meta "Package"))
         (synopsis   (assoc-ref meta "Title"))
         (version    (assoc-ref meta "Version"))
         (license    (string->license (assoc-ref meta "License")))
         ;; Some packages have multiple home pages.  Some have none.
         (home-page  (match (listify meta "URL")
                       ((url rest ...) url)
                       (_ (string-append %cran-url name))))
         (source-url (match (cran-uri name version)
                       ((url rest ...) url)
                       (_ #f)))
         (tarball    (with-store store (download-to-store store source-url)))
         (sysdepends (map string-downcase (listify meta "SystemRequirements")))
         (propagate  (map guix-name (lset-union equal?
                                                (listify meta "Imports")
                                                (listify meta "LinkingTo")
                                                (delete "R"
                                                        (listify meta "Depends"))))))
    `(package
       (name ,(guix-name name))
       (version ,version)
       (source (origin
                 (method url-fetch)
                 (uri (cran-uri ,name version))
                 (sha256
                  (base32
                   ,(bytevector->nix-base32-string (file-sha256 tarball))))))
       (properties ,`(,'quasiquote ((,'upstream-name . ,name))))
       (build-system r-build-system)
       ,@(maybe-inputs sysdepends)
       ,@(maybe-inputs propagate 'propagated-inputs)
       (home-page ,(if (string-null? home-page)
                       (string-append %cran-url name)
                       home-page))
       (synopsis ,synopsis)
       (description ,(beautify-description (assoc-ref meta "Description")))
       (license ,license))))

(define (cran->guix-package package-name)
  "Fetch the metadata for PACKAGE-NAME from cran.r-project.org, and return the
`package' s-expression corresponding to that package, or #f on failure."
  (let ((module-meta (cran-fetch package-name)))
    (and=> module-meta cran-sxml->sexp)))
    (and=> module-meta description->package)))


;;;


@@ 209,32 203,33 @@ representation of the package page."

(define (latest-release package)
  "Return an <upstream-source> for the latest release of PACKAGE."
  (define name
    (if (string-prefix? "r-" package)
        (string-drop package 2)
        package))

  (define sxml
    (cran-fetch name))

  (and sxml
       (sxml-match-let*
        (((*TOP* (xhtml:html
                  ,head
                  (xhtml:body
                   (xhtml:h2 ,name-and-synopsis)
                   (xhtml:p ,description)
                   ,summary
                   (xhtml:h4 "Downloads:") ,downloads
                   . ,rest)))
          sxml))
        (let ((version (nodes->text (table-datum summary "Version:")))
              (url     (downloads->url downloads)))
          ;; CRAN does not provide signatures.
          (upstream-source
           (package package)
           (version version)
           (urls (list url)))))))

  (define (package->cran-name package)
    (match (package-source package)
      ((? origin? origin)
       (match (origin-uri origin)
         ((url rest ...)
          (let ((end   (string-rindex url #\_))
                (start (string-rindex url #\/)))
            ;; The URL ends on
            ;; (string-append "/" name "_" version ".tar.gz")
            (substring url start end)))
         (_ #f)))
    (_ #f)))

  (define cran-name
    (package->cran-name (specification->package package)))

  (define meta
    (cran-fetch cran-name))

  (and meta
       (let ((version (assoc-ref meta "Version")))
         ;; CRAN does not provide signatures.
         (upstream-source
          (package package)
          (version version)
          (urls (cran-uri cran-name version))))))

(define (cran-package? package)
  "Return true if PACKAGE is an R package from CRAN."

M tests/cran.scm => tests/cran.scm +79 -110
@@ 19,120 19,84 @@
(define-module (test-cran)
  #:use-module (guix import cran)
  #:use-module (guix tests)
  #:use-module (srfi srfi-1)
  #:use-module (srfi srfi-64)
  #:use-module (srfi srfi-26)
  #:use-module (ice-9 match))

(define sxml
  '(*TOP* (xhtml:html
           (xhtml:head
            (xhtml:title "CRAN - Package my-example-sxml"))
           (xhtml:body
            (xhtml:h2 "my-example-sxml: Short description")
            (xhtml:p "Long description")
            (xhtml:table
             (@ (summary "Package my-example-sxml summary"))
             (xhtml:tr
              (xhtml:td "Version:")
              (xhtml:td "1.2.3"))
             (xhtml:tr
              (xhtml:td "Depends:")
              (xhtml:td "R (>= 3.1.0)"))
             (xhtml:tr
              (xhtml:td "SystemRequirements:")
              (xhtml:td "cairo (>= 1.2 http://www.cairographics.org/)"))
             (xhtml:tr
              (xhtml:td "Imports:")
              (xhtml:td
               (xhtml:a (@ (href "../scales/index.html"))
                        "scales")
               " (>= 0.2.3), "
               (xhtml:a (@ (href "../proto/index.html"))
                        "proto")
               ", "
               (xhtml:a (@ (href "../Rcpp/index.html")) "Rcpp")
               " (>= 0.11.0)"))
             (xhtml:tr
              (xhtml:td "Suggests:")
              (xhtml:td
               (xhtml:a (@ (href "../some/index.html"))
                        "some")
               ", "
               (xhtml:a (@ (href "../suggestions/index.html"))
                        "suggestions")))
             (xhtml:tr
              (xhtml:td "License:")
              (xhtml:td
               (xhtml:a (@ (href "../../licenses/MIT")) "MIT")))
             (xhtml:tr
              (xhtml:td "URL:")
              (xhtml:td
               (xhtml:a (@ (href "http://gnu.org/s/my-example-sxml"))
                        "http://gnu.org/s/my-example-sxml")
               ", "
               (xhtml:a (@ (href "http://alternative/home/page"))
                        "http://alternative/home/page"))))
            (xhtml:h4 "Downloads:")
            (xhtml:table
             (@ (summary "Package my-example-sxml downloads"))
             (xhtml:tr
              (xhtml:td " Reference manual: ")
              (xhtml:td
               (xhtml:a (@ (href "my-example-sxml.pdf"))
                        " my-example-sxml.pdf ")))
             (xhtml:tr
              (xhtml:td " Package source: ")
              (xhtml:td
               (xhtml:a
                (@ (href "../../../src/contrib/my-example-sxml_1.2.3.tar.gz"))
                " my-example-sxml_1.2.3.tar.gz "))))
            (xhtml:h4 "Reverse dependencies:")
            (xhtml:table
             (@ (summary "Package my-example-sxml reverse dependencies"))
             (xhtml:tr
              (xhtml:td "Reverse depends:")
              (xhtml:td "Too many."))
             (xhtml:tr
              (xhtml:td "Reverse imports:")
              (xhtml:td "Likewise."))
             (xhtml:tr
              (xhtml:td "Reverse suggests:")
              (xhtml:td "Uncountable.")))))))
(define description "
Package: My-Example
Type: Package
Title: Example package
Version: 1.2.3
Date: 2015-12-10
Author: Ricardo Wurmus
Maintainer: Guix Schmeeks <guix@gnu.org>
URL: http://gnu.org/s/my-example
Description: This is a long description
spanning multiple lines: and it could confuse the parser that
there is a colon : on the lines.
  And: this line continues the description.
biocViews: 0
SystemRequirements: Cairo (>= 0)
Depends: A C++11 compiler. Version 4.6.* of g++ (as
	currently in Rtools) is insufficient; versions 4.8.*, 4.9.* or
	later will be fine.
License: GPL (>= 3)
Imports: Rcpp (>= 0.11.5), proto, Scales
LinkingTo: Rcpp, BH
NeedsCompilation: yes
Repository: CRAN
Date/Publication: 2015-07-14 14:15:16
")

(define simple-table
  '(xhtml:table
    (xhtml:tr
     (xhtml:td "Numbers")
     (xhtml:td "123"))
    (xhtml:tr
     (@ (class "whatever"))
     (xhtml:td (@ (class "unimportant")) "Letters")
     (xhtml:td "abc"))
    (xhtml:tr
     (xhtml:td "Letters")
     (xhtml:td "xyz"))
    (xhtml:tr
     (xhtml:td "Single"))
    (xhtml:tr
     (xhtml:td "not a value")
     (xhtml:td "not a label")
     (xhtml:td "also not a label"))))
(define description-alist
  ((@@ (guix import cran) description->alist) description))

(define simple-alist
  '(("Key"        . "Value")
    ("SimpleList" . "R, Rcpp, something, whatever")
    ("BadList"    . "This is not a real list, you know?")
    ("List"       . "R (>= 2.2), BH (for no reason), GenomicRanges")))

(test-begin "cran")

(test-equal "table-datum: return list of first table cell matching label"
  '((xhtml:td "abc"))
  ((@@ (guix import cran) table-datum) simple-table "Letters"))
(test-assert "description->alist: contains all valid keys"
  (let ((keys '("Package" "Type" "Title" "Version" "Date"
                "Author" "Maintainer" "URL" "Description"
                "SystemRequirements" "Depends" "License"
                "Imports" "biocViews" "LinkingTo"
                "NeedsCompilation" "Repository"
                "Date/Publication")))
    (lset= string=? keys (map car description-alist))))

(test-equal "table-datum: return empty list if no match"
(test-equal "listify: return empty list if key cannot be found"
  '()
  ((@@ (guix import cran) table-datum) simple-table "Astronauts"))
  ((@@ (guix import cran) listify) simple-alist "Letters"))

(test-equal "listify: split comma-separated value into elements"
  '("R" "Rcpp" "something" "whatever")
  ((@@ (guix import cran) listify) simple-alist "SimpleList"))

(test-equal "table-datum: only consider the first cell as a label cell"
(test-equal "listify: strip off parentheses"
  '("R" "BH" "GenomicRanges")
  ((@@ (guix import cran) listify) simple-alist "List"))

(test-equal "listify: ignore values that are no lists"
  '()
  ((@@ (guix import cran) table-datum) simple-table "not a label"))
  ((@@ (guix import cran) listify) simple-alist "BadList"))

(test-equal "beautify-description: use double spacing"
  "This is a package.  It is great.  Trust me Mr.  Hendrix."
  ((@@ (guix import cran) beautify-description)
   "This is a package. It is great. Trust me Mr. Hendrix."))

(test-equal "beautify-description: transform fragment into sentence"
  "This package provides a function to establish world peace"
  ((@@ (guix import cran) beautify-description)
   "A function to establish world peace"))

(test-assert "cran-sxml->sexp"
(test-assert "description->package"
  ;; Replace network resources with sample data.
  (mock ((guix build download) url-fetch
         (lambda* (url file-name #:key (mirrors '()))


@@ 140,32 104,37 @@
             (lambda ()
               (display
                (match url
                  ("mirror://cran/src/contrib/my-example-sxml_1.2.3.tar.gz"
                  ("mirror://cran/src/contrib/My-Example_1.2.3.tar.gz"
                   "source")
                  (_ (error "Unexpected URL: " url))))))))
    (match ((@@ (guix import cran) cran-sxml->sexp) sxml)
    (match ((@@ (guix import cran) description->package) description-alist)
      (('package
         ('name "r-my-example-sxml")
         ('name "r-my-example")
         ('version "1.2.3")
         ('source ('origin
                    ('method 'url-fetch)
                    ('uri ('cran-uri "my-example-sxml" 'version))
                    ('uri ('cran-uri "My-Example" 'version))
                    ('sha256
                     ('base32
                      (? string? hash)))))
         ('properties ('quasiquote (('upstream-name . "My-Example"))))
         ('build-system 'r-build-system)
         ('inputs
          ('quasiquote
           (("cairo" ('unquote 'cairo)))))
         ('propagated-inputs
          ('quasiquote
           (("r-proto" ('unquote 'r-proto))
           (("r-bh" ('unquote 'r-bh))
            ("r-proto" ('unquote 'r-proto))
            ("r-rcpp" ('unquote 'r-rcpp))
            ("r-scales" ('unquote 'r-scales)))))
         ('home-page "http://gnu.org/s/my-example-sxml")
         ('synopsis "Short description")
         ('description "Long description")
         ('license 'x11)))
         ('home-page "http://gnu.org/s/my-example")
         ('synopsis "Example package")
         ('description
          "This is a long description spanning multiple lines: \
and it could confuse the parser that there is a colon : on the \
lines.  And: this line continues the description.")
         ('license 'gpl3+)))
      (x
       (begin
         (format #t "~s\n" x)