Author: ianb
Date: Mon Jun 25 21:01:53 2007
New Revision: 44525
Added:
lxml/branch/html/src/lxml/html/_diffcommand.py
Modified:
lxml/branch/html/src/lxml/html/diff.py
Log:
Added the start of a diffing command-line
Added: lxml/branch/html/src/lxml/html/_diffcommand.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/_diffcommand.py Mon Jun 25 21:01:53 2007
@@ -0,0 +1,87 @@
+import optparse
+import sys
+import re
+import os
+from lxml.html.diff import htmldiff
+
+description = ""
+
+parser = optparse.OptionParser(
+    usage="%prog [OPTIONS] FILE1 FILE2\n"
+    "%prog --annotate [OPTIONS] INFO1 FILE1 INFO2 FILE2 ...",
+    description=description)
+
+# "-" (the default) makes main() write the result to standard output.
+parser.add_option(
+    '-o', '--output',
+    metavar="FILE",
+    dest="output",
+    default="-",
+    help="File to write the difference to")
+
+# --annotate is the spelling the usage text advertises; --annotation is
+# kept as an alias so existing invocations continue to work.
+parser.add_option(
+    '-a', '--annotate', '--annotation',
+    action="store_true",
+    dest="annotation",
+    help="Do an annotation")
+
+def main(args=None):
+    """Command-line entry point; args defaults to sys.argv[1:]."""
+    if args is None:
+        args = sys.argv[1:]
+    options, args = parser.parse_args(args)
+    if options.annotation:
+        return annotate(options, args)
+    if len(args) != 2:
+        # Report on stderr: stdout may be carrying the diff output.
+        sys.stderr.write('Error: you must give two files\n')
+        parser.print_help(sys.stderr)
+        sys.exit(1)
+    body1 = split_body(read_file(args[0]))[1]
+    pre, body2, post = split_body(read_file(args[1]))
+    result = pre + htmldiff(body1, body2) + post
+    if options.output == '-':
+        if not result.endswith('\n'):
+            result += '\n'
+        sys.stdout.write(result)
+        return
+    f = open(options.output, 'wb')
+    try:
+        f.write(result)
+    finally:
+        f.close()
+
+def read_file(filename):
+    """Return the contents of filename ('-' reads stdin); OSError if missing."""
+    if filename == '-':
+        return sys.stdin.read()
+    if not os.path.exists(filename):
+        raise OSError("Input file %s does not exist" % filename)
+    f = open(filename, 'rb')
+    try:
+        return f.read()
+    finally:
+        f.close()
+
+# Opening and closing <body> tags; \b keeps look-alike tag names from matching.
+body_start_re = re.compile(r"<body\b.*?>", re.I | re.S)
+body_end_re = re.compile(r"</body\b.*?>", re.I | re.S)
+
+
+def split_body(html):
+    """Return (pre, body, post); pre/post are '' when a body tag is absent."""
+    pre = post = ''
+    match = body_start_re.search(html)
+    if match:
+        pre, html = html[:match.end()], html[match.end():]
+    match = body_end_re.search(html)
+    if match:
+        post, html = html[match.start():], html[:match.start()]
+    return pre, html, post
+
+def annotate(options, args):
+    """Annotation mode placeholder: report on stderr and exit with status 1."""
+    sys.exit("Not yet implemented")
+
Modified: lxml/branch/html/src/lxml/html/diff.py
==============================================================================
--- lxml/branch/html/src/lxml/html/diff.py (original)
+++ lxml/branch/html/src/lxml/html/diff.py Mon Jun 25 21:01:53 2007
@@ -865,27 +865,8 @@
return [item for item in actual
if item[2] > threshold
or not item[2]]
-
-# def get_matching_blocks(self):
-# size = min(len(self.b), len(self.b))
-# threshold = min(self.threshold, size / 4)
-# actual = difflib.SequenceMatcher.get_matching_blocks(self)
-# last_equal_a = 0
-# eliminate = []
-# for i in xrange(1, len(actual)-1):
-# start_diff_length = actual[i][0] - (actual[i-1][0] + actual[i-1][2])
-# end_diff_length = actual[i+1][0]
-# for a_pos, b_pos, length in actual:
-# if (last_equal_a - a_pos is big
-# and length is small
-# and next_equal_a is far away):
-# continue
-# result.append((a_pos, b_pos, length))
-# last_equal_a = a_pos+length
-# return result
-
if __name__ == '__main__':
- import doctest
- doctest.testmod()
-
+ from lxml.html import _diffcommand
+ _diffcommand.main()
+
|