#!/usr/bin/env python
"""
A 'grep' for HTML/XML documents that uses xpath syntax.
Usage: xpathgrep.py xpath_expr [filename ...]
"""
import sys
from lxml import etree
def grep(expr, filename):
try:
if not filename:
filename = ''
htmldata = sys.stdin.read()
else:
htmldata = file(filename).read()
doc = etree.HTML(htmldata)
except Exception, e:
print >> sys.stderr, 'xpathgrep: %s: %s' % (filename, e)
return
try:
results = doc.xpath(expr)
except Exception, e:
print >> sys.stderr, 'xpathgrep: %s' % e
sys.exit(1) # no point in continuing if the expr has a syntax error
if results:
print '%s: %s matches' % (filename, len(results))
for node in results:
print ' * ' + str(node).replace('\n', '\n ')
def main():
if len(sys.argv) < 2:
print >> sys.stderr, __doc__.strip()
sys.exit(1)
expr = sys.argv[1]
filenames = sys.argv[2:]
if not filenames:
filenames = [None] # read from sys.stdin
for filename in filenames:
grep(expr, filename)
if __name__ == '__main__':
main()