mirror of
https://github.com/php/php-src.git
synced 2024-12-04 15:23:44 +08:00
154 lines
6.5 KiB
Plaintext
154 lines
6.5 KiB
Plaintext
|
|
README FOR ext/tidy by John Coggeshall <john@php.net>
|
|
|
|
Tidy Version: 0.5b
|
|
|
|
Tidy is an extension based on Libtidy (http://tidy.sf.net/) and allows a PHP developer
|
|
to clean, repair, and traverse HTML, XHTML, and XML documents -- including ones with
|
|
embedded scripting languages such as PHP or ASP within them using OO constructs.
|
|
|
|
The Tidy extension has two separate APIs, one for general parsing, cleaning, and
|
|
repairing and another for document traversal. The general API is provided below:
|
|
|
|
tidy_create() Initialize and return a tidy document resource
|
|
tidy_parse_file($tidy, $file) Parse the document stored in $file
|
|
tidy_parse_string($tidy, $str) Parse the string stored in $str
|
|
|
|
tidy_clean_repair($tidy) Clean and repair the document
|
|
tidy_diagnose($tidy) Diagnose a parsed document
|
|
|
|
tidy_setopt($tidy, $opt, $val) Set a configuration option $opt to $val
|
|
tidy_getopt($tidy, $opt) Retrieve a configuration option
|
|
|
|
** note: $opt is a string representing the option. Right now the only
|
|
source of these options is the LibTidy source. Eventually I'll document
|
|
them officially -- see the src/config.c file in the tidy source **
|
|
|
|
tidy_get_output($tidy) Return the cleaned tidy HTML as a string
|
|
tidy_get_error_buffer($tidy) Return a log of the errors and warnings
|
|
returned by tidy
|
|
|
|
tidy_get_release() Return the Libtidy release date
|
|
tidy_get_status($tidy) Return the status of the document
|
|
tidy_get_html_ver($tidy) Return the major HTML version detected for
|
|
the document
|
|
|
|
tidy_is_xhtml($tidy) Determines if the document is XHTML
|
|
tidy_is_xml($tidy) Determines if the document is generic XML
|
|
|
|
tidy_error_count($tidy) Returns the number of errors in the document
|
|
tidy_warning_count($tidy) Returns the number of warnings in the document
|
|
tidy_access_count($tidy) Returns the number of accessibility-related
|
|
warnings in the document.
|
|
tidy_config_count($tidy) Returns the number of configuration errors found
|
|
|
|
tidy_load_config($tidy, $file) Loads the specified configuration file
|
|
tidy_load_config_enc($tidy,
|
|
$file,
|
|
$enc) Loads the specified config file using the specified
|
|
character encoding
|
|
tidy_set_encoding($tidy, $enc) Sets the current character encoding for the document
|
|
tidy_save_config($tidy, $file) Saves the current config to $file
|
|
|
|
|
|
Beyond these general-purpose API functions, Tidy also supports the following
|
|
functions which are used to retrieve an object for document traversal:
|
|
|
|
tidy_get_root($tidy) Returns an object starting at the root of the
|
|
document
|
|
tidy_get_head($tidy) Returns an object starting at the <HEAD> tag
|
|
tidy_get_html($tidy) Returns an object starting at the <HTML> tag
|
|
tidy_get_body($tidy) Returns an object starting at the <BODY> tag
|
|
|
|
All navigation of the specified document is done via the PHP5 object constructs.
|
|
There are two types of objects which Tidy can create. The first is TidyNode, which
|
|
represents HTML Tags, Text, and more (see the TidyNode_Type Constants). The second
|
|
is TidyAttr, which represents an attribute within an HTML tag (TidyNode). The
|
|
functionality of these objects is represented by the following schema:
|
|
|
|
class TidyNode {
|
|
|
|
public $name; // name of node (e.g. HEAD)
|
|
public $value; // value of node (everything between tags)
|
|
public $type; // type of node (text, php, asp, etc.)
|
|
public $id; // id of node (e.g. TIDY_TAG_HEAD)
|
|
|
|
public $line; // line # of node in source
|
|
public $column; // column # of node in source
|
|
|
|
public $html_ver; // HTML version (0,1,2,3,4)
|
|
|
|
public $attribs; // an array of attributes (see TidyAttr)
|
|
public $children; // an array of child nodes
|
|
|
|
function has_siblings(); // any sibling nodes?
|
|
function has_children(); // any child nodes?
|
|
function has_parent(); // have a parent?
|
|
|
|
function is_comment(); // is node a comment?
|
|
function is_xhtml(); // is document XHTML?
|
|
function is_xml(); // is document generic XML (not HTML/XHTML)?
|
|
function is_text(); // is node text?
|
|
function is_html(); // is node an HTML tag?
|
|
|
|
function is_jste(); // is jste block?
|
|
function is_asp(); // is Microsoft ASP block?
|
|
function is_php(); // is PHP block?
|
|
|
|
function next(); // returns next node
|
|
function prev(); // returns prev node
|
|
function parent(); // returns parent node
|
|
function child(); // returns first child node
|
|
|
|
/* Searches for a particular attribute in the current node based
|
|
on attribute ID. If found, returns a TidyAttr object for it */
|
|
function get_attr_type($attr_id);
|
|
|
|
/*
|
|
|
|
NOT YET IMPLEMENTED
|
|
|
|
Recursively traverses the tree from the current node and returns
|
|
an array of attributes matching the node ID/attr ID pair
|
|
|
|
Useful for pulling out things like links:
|
|
foreach($body->fetch_attrs(TIDY_TAG_A, TIDY_ATTR_HREF) as $link) {
|
|
echo "Link : {$link->value}\n";
|
|
}
|
|
*/
|
|
|
|
function fetch_attrs($node_id, $attr_id);
|
|
|
|
/*
|
|
|
|
NOT YET IMPLEMENTED
|
|
|
|
Recursively traverses the tree from the current node and returns
|
|
an array of nodes matching the node ID
|
|
|
|
Useful for pulling out tables, etc. (echoes the HTML for every
|
|
<TABLE> block)
|
|
|
|
foreach($body->fetch_nodes(TIDY_TAG_TABLE) as $table) {
|
|
|
|
echo $table->value;
|
|
|
|
}
|
|
*/
|
|
function fetch_nodes($node_id);
|
|
}
|
|
|
|
class TidyAttr {
|
|
|
|
public $name; // attribute name, e.g. HREF
|
|
public $value; // attribute value
|
|
public $id; // attribute id, e.g. TIDY_ATTR_HREF
|
|
|
|
function next(); // returns next attribute in tag
|
|
function tag(); // returns the tag node associated with attribute
|
|
}
|
|
|
|
Examples of using these objects to navigate the tree can be found in the examples/
|
|
directory (I suggest looking at urlgrab.php and dumpit.php)
|
|
|
|
E-mail thoughts, suggestions, patches, etc. to <john@php.net>