logo       
Google Custom Search
    AddThis Social Bookmark Button
-->

r44525 - lxml/branch/html/src/lxml/html: msg#00168

Subject: r44525 - lxml/branch/html/src/lxml/html
Author: ianb
Date: Mon Jun 25 21:01:53 2007
New Revision: 44525

Added:
   lxml/branch/html/src/lxml/html/_diffcommand.py
Modified:
   lxml/branch/html/src/lxml/html/diff.py
Log:
Added the start of a diffing command-line

Added: lxml/branch/html/src/lxml/html/_diffcommand.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/_diffcommand.py      Mon Jun 25 21:01:53 2007
@@ -0,0 +1,87 @@
+import optparse
+import sys
+import re
+import os
+from lxml.html.diff import htmldiff
+
+description = """\
+"""
+
+parser = optparse.OptionParser(
+    usage="%prog [OPTIONS] FILE1 FILE2\n"
+    "%prog --annotate [OPTIONS] INFO1 FILE1 INFO2 FILE2 ...",
+    description=description,
+    )
+
+parser.add_option(
+    '-o', '--output',
+    metavar="FILE",
+    dest="output",
+    default="-",
+    help="File to write the difference to",
+    )
+
+parser.add_option(
+    '-a', '--annotation',
+    action="store_true",
+    dest="annotation",
+    help="Do an annotation")
+
+def main(args=None):
+    if args is None:
+        args = sys.argv[1:]
+    options, args = parser.parse_args(args)
+    if options.annotation:
+        return annotate(options, args)
+    if len(args) != 2:
+        print 'Error: you must give two files'
+        parser.print_help()
+        sys.exit(1)
+    file1, file2 = args
+    input1 = read_file(file1)
+    input2 = read_file(file2)
+    body1 = split_body(input1)[1]
+    pre, body2, post = split_body(input2)
+    result = htmldiff(body1, body2)
+    result = pre + result + post
+    if options.output == '-':
+        if not result.endswith('\n'):
+            result += '\n'
+        sys.stdout.write(result)
+    else:
+        f = open(options.output, 'wb')
+        f.write(result)
+        f.close()
+
+def read_file(filename):
+    if filename == '-':
+        c = sys.stdin.read()
+    elif not os.path.exists(filename):
+        raise OSError(
+            "Input file %s does not exist" % filename)
+    else:
+        f = open(filename, 'rb')
+        c = f.read()
+        f.close()
+    return c
+
+body_start_re = re.compile(
+    r"<body.*?>", re.I|re.S)
+body_end_re = re.compile(
+    r"</body.*?>", re.I|re.S)
+    
+def split_body(html):
+    match = body_start_re.search(html)
+    if match:
+        pre = html[:match.end()]
+        html = html[match.end():]
+    match = body_end_re.search(html)
+    if match:
+        post = html[match.start():]
+        html = html[:match.start()]
+    return pre, html, post
+
+def annotate(options, args):
+    print "Not yet implemented"
+    sys.exit(1)
+    

Modified: lxml/branch/html/src/lxml/html/diff.py
==============================================================================
--- lxml/branch/html/src/lxml/html/diff.py      (original)
+++ lxml/branch/html/src/lxml/html/diff.py      Mon Jun 25 21:01:53 2007
@@ -865,27 +865,8 @@
         return [item for item in actual
                 if item[2] > threshold
                 or not item[2]]
-    
-# def get_matching_blocks(self):
-#         size = min(len(self.b), len(self.b))
-#         threshold = min(self.threshold, size / 4)
-#         actual = difflib.SequenceMatcher.get_matching_blocks(self)
-#         last_equal_a = 0
-#         eliminate = []
-#         for i in xrange(1, len(actual)-1):
-#             start_diff_length = actual[i][0] - (actual[i-1][0] + actual[i-1][2])
-#             end_diff_length = actual[i+1][0]
-#         for a_pos, b_pos, length in actual:
-#             if (last_equal_a - a_pos is big
-#                 and length is small
-#                 and next_equal_a is far away):
-#                 continue
-#             result.append((a_pos, b_pos, length))
-#             last_equal_a = a_pos+length
-#         return result
-            
 
 if __name__ == '__main__':
-    import doctest
-    doctest.testmod()
-
+    from lxml.html import _diffcommand
+    _diffcommand.main()
+    


<Prev in Thread] Current Thread [Next in Thread>