如何为统一差异语法编写解析器

问题描述:

我应该使用 RegexParsers 还是 StandardTokenParsers?或者说,它们究竟适不适合解析这种语法?语法示例可以在这里(here)找到。

这是使用RegexParsers的解决方案。

import scala.util.parsing.combinator.RegexParsers 

/** Parser-combinator grammar for a single-file unified diff.
  *
  * The grammar is: an old-file header (`--- `), a new-file header (`+++ `) and one or
  * more change chunks, each consisting of an `@@ -a,b +c,d @@` range line followed by
  * one or more context / added / deleted lines. Every line must end with `\n`.
  */
object UnifiedDiffParser extends RegexParsers {

    // case classes representing the data of the diff
    case class UnifiedDiff(oldFile: File, newFile: File, changeChunks: List[ChangeChunk])
    case class File(name: String, timeStamp: String)
    case class ChangeChunk(rangeInformation: RangeInformation, changeLines: List[String])
    case class RangeInformation(oldOffset: Int, oldLength: Int, newOffset: Int, newLength: Int)

    // Spaces and newlines are part of the grammar, so automatic whitespace skipping is off.
    override def skipWhitespace = false

    /** Whole document: both file headers followed by at least one chunk. */
    def unifiedDiff: Parser[UnifiedDiff] = oldFile ~ newFile ~ rep1(changeChunk) ^^ {
        case of ~ nf ~ l => UnifiedDiff(of, nf, l)
    }

    /** Header line of the original file (`--- name[ timestamp]`). */
    def oldFile: Parser[File] = fileHeader("--- ")

    /** Header line of the modified file (`+++ name[ timestamp]`). */
    def newFile: Parser[File] = fileHeader("+++ ")

    /** Shared shape of both header lines.
      *
      * The timestamp is optional; when it is absent the resulting `File` carries an
      * empty `timeStamp`.
      */
    private def fileHeader(marker: String): Parser[File] =
        (literal(marker) ~> filename) ~ (opt(headerSeparator ~> timestamp) <~ newline) ^^ {
            case name ~ stamp => File(name, stamp.getOrElse(""))
        }

    // Only blanks and tabs: a separator that could match '\n' would run into the next
    // line and turn it into the "timestamp" of a header that has none.
    private def headerSeparator: Parser[String] = """[ \t]+""".r

    def filename: Parser[String] = """[\S]+""".r
    def timestamp: Parser[String] = """.*""".r

    /** One hunk: the range line, its newline, then at least one change line. */
    def changeChunk: Parser[ChangeChunk] = rangeInformation ~ (newline ~> rep1(lineChange)) ^^ {
        case ri ~ l => ChangeChunk(ri, l)
    }

    /** Range line `@@ -start[,count] +start[,count] @@`.
      *
      * The unified format omits `,count` for one-line hunks, so a missing count is
      * read as 1.
      */
    def rangeInformation: Parser[RangeInformation] =
        ("@@ " ~> "-" ~> number) ~ opt("," ~> number) ~ (" +" ~> number) ~ opt("," ~> number) <~ " @@" ^^ {
            case a ~ b ~ c ~ d => RangeInformation(a, b getOrElse 1, c, d getOrElse 1)
        }

    // Each change line keeps its leading marker character (' ', '+' or '-').
    def lineChange: Parser[String] = contextLine | addedLine | deletedLine
    def contextLine: Parser[String] = """ .*""".r <~ newline
    def addedLine: Parser[String] = """\+.*""".r <~ newline
    def deletedLine: Parser[String] = """-.*""".r <~ newline

    def newline: Parser[String] = """\n""".r
    def number: Parser[Int] = """\d+""".r ^^ {_.toInt}

    /** Parses the file named by the first argument, or stdin when there are no
      * arguments, and prints the parse result.
      */
    def main(args: Array[String]): Unit = {
        args.headOption match {
            case Some(path) =>
                val reader = new java.io.FileReader(path)
                // Release the file handle even when parsing throws.
                try println(parseAll(unifiedDiff, reader)) finally reader.close()
            case None =>
                // read from stdin
                println(parseAll(unifiedDiff, Console.in))
        }
    }
}
+0

谢谢,这正是我要找的! – JtR 2010-08-25 13:33:07

这种格式本身就是为便于解析而设计的:你完全可以不用任何正则表达式,也不需要对输入做词法切分(tokenize)。只要逐行查看每行开头的一两个字符即可。文件头和块头需要多花一点心思,但用简单的字符串分割也完全可以处理。

当然,如果你想学习如何使用一些解析库,那就去吧。

+2

几年前,在我学编程的第一周,我开始为 Nethack 制作补丁。当时我不知道有 'diff' 这个工具,于是开始手写这些该死的东西。当新闻组里有人礼貌地告诉我我可能是疯了的时候,你可以想象我有多尴尬。好吧,无论如何,统一差异格式不仅容易解析,手写起来也不难。 :) – guns 2010-08-24 19:25:07

+0

我不想显式地自己维护解析状态,而且学习使用解析器组合器也是我的目标之一。大多数示例解析的都是某种编程语言的语法,但我想知道解析器组合器是否也可以用来解析 diff 语法,甚至二进制格式。 – JtR 2010-08-24 20:25:16

我会使用正则表达式。它能简化一些事情,其余部分用标准写法即可。

/** Reads a unified diff line by line and prints a one-line description of each line.
  *
  * Each line is matched, in order, against the original-file header, the new-file
  * header, the chunk header, and then the added / removed / unchanged line patterns.
  * A line that matches none of them makes the partial function throw a `MatchError`.
  *
  * @param src source whose lines are consumed via `getLines()`
  */
def process(src: scala.io.Source): Unit = {
    import scala.util.matching.Regex

    // Header lines are expected as: <marker> <path> ''<timestamp>''
    val FilePattern = """(.*) ''(.*)''"""
    val OriginalFile = new Regex("--- "+FilePattern, "path", "timestamp")
    // '+' is a regex quantifier, so every literal plus sign has to be escaped;
    // an unescaped leading '+' is rejected when the pattern is compiled.
    val NewFile = new Regex("""\+\+\+ """+FilePattern, "path", "timestamp")
    val Chunk = new Regex("""@@ -(\d+),(\d+) \+(\d+),(\d+) @@""", "orgStarting", "orgSize", "newStarting", "newSize")
    val AddedLine = """\+(.*)""".r
    val RemovedLine = """-(.*)""".r
    val UnchangedLine = """ (.*)""".r

    // The header patterns come first: "--- x" / "+++ x" would otherwise also be
    // accepted by RemovedLine / AddedLine.
    src.getLines() foreach {
        case OriginalFile(path, timestamp) => println("Original file: "+path)
        case NewFile(path, timestamp) => println("New file: "+path)
        // The captured groups are Strings, so they are formatted with %s (a %d
        // conversion rejects String arguments).
        case Chunk(l1, s1, l2, s2) => println("Modifying %s lines at line %s, to %s lines at %s" format (s1, l1, s2, l2))
        case AddedLine(line) => println("Adding line "+line)
        case RemovedLine(line) => println("Removing line "+line)
        case UnchangedLine(line) => println("Keeping line "+line)
    }
}
+0

我一直希望能借助解析器组合器摆脱自己维护状态的做法。我要根据补丁的内容构建一个对象图,例如一个 Patch 包含若干 FileModification,而每个 FileModification 又包含若干 Chunk。看起来,与其在解析过程中借助一些变量来构建对象图并跟踪解析状态,解析器组合器能提供一种更简单的方式,直接从解析结果创建对象。 – JtR 2010-08-24 20:22:52

+0

顺便说一句,我不知道这样的正则表达式可以在模式匹配中使用,非常整齐! – JtR 2010-08-24 20:26:01

我是在想为 git diff(由运行 git diff-tree 生成)编写一个 Scala 解析器时偶然发现这个问题的。这种输出与统一差异格式非常相似,但确实有一些有趣的差别。

我在很大程度上参考了上面的答案,最终写出了下面这个解析器。当然,这并不完全是原提问者想要的,但我认为它可能对其他人有用。

import util.parsing.combinator._ 

/** Companion providing a factory that normalises the two file names of a diff. */
object GitDiff {
    /** Builds a `GitDiff` from the raw pair of file names.
      *
      * Everything up to and including the first '/' of each name is removed before
      * the names are stored; a name without any '/' becomes the empty string.
      *
      * @param files  (old name, new name) as they appear in the diff header
      * @param op     operation performed on the file
      * @param chunks change chunks belonging to the file
      */
    def apply (files: (String,String), op: FileOperation, chunks: List[ChangeChunk]): GitDiff = {
        // file names have "a/" or "b/" as prefix, need to drop that to compare
        def withoutPrefix(name: String): String = name.indexOf('/') match {
            case -1 => ""
            case slash => name.substring(slash + 1)
        }
        val (rawOld, rawNew) = files
        new GitDiff(withoutPrefix(rawOld), withoutPrefix(rawNew), op, chunks)
    }
}

/** The changes made to one file.
  *
  * @param oldFile name of the file before the change
  * @param newFile name of the file after the change
  * @param op      operation performed on the file
  * @param chunks  change chunks of the file
  */
case class GitDiff(
    oldFile: String,
    newFile: String,
    op: FileOperation,
    chunks: List[ChangeChunk]
) {
    /** True when the old and the new file name are not equal. */
    val isRename: Boolean = !(oldFile == newFile)
}

/** Kind of operation a diff performs on a file; sealed, so matches can be checked for exhaustiveness. */
sealed trait FileOperation

/** The file is created by the diff; `mode` is the integer file mode that accompanies it. */
case class NewFile(mode: Int)
    extends FileOperation

/** The file is removed by the diff; `mode` is the integer file mode that accompanies it. */
case class DeletedFile(mode: Int)
    extends FileOperation

/** The file is neither created nor deleted by the diff. */
case object UpdatedFile
    extends FileOperation

/** One line of a change chunk; `line` is the text carried by the concrete case. */
sealed trait LineChange {
    def line: String
}

/** A line present in both versions of the file. */
case class ContextLine(line: String)
    extends LineChange

/** A line present only in the old version of the file. */
case class LineRemoved(line: String)
    extends LineChange

/** A line present only in the new version of the file. */
case class LineAdded(line: String)
    extends LineChange

/** The four numbers of a chunk header: offset and length on the old and on the new side. */
case class RangeInformation(
    oldOffset: Int,
    oldLength: Int,
    newOffset: Int,
    newLength: Int
)

/** A chunk: its range header together with the lines it contains. */
case class ChangeChunk(
    rangeInformation: RangeInformation,
    changeLines: List[LineChange]
)

// Code taken from http://stackoverflow.com/questions/3560073/how-to-write-parser-for-unified-diff-syntax 
/** Parser-combinator grammar for git-style diffs.
  *
  * The input is one or more file diffs. Each consists of a `diff --git` line, an
  * optional `new file mode` / `deleted file mode` line, an `index` line, the `---` /
  * `+++` file headers, and one or more change chunks. Every line must end with `\n`.
  */
object GitDiffParser extends RegexParsers {

    // Spaces and newlines are part of the grammar, so automatic whitespace skipping is off.
    override def skipWhitespace = false

    /** One or more consecutive file diffs. */
    def allDiffs: Parser[List[GitDiff]] = rep1(gitDiff)

    /** A single file diff: changed file names, operation, then its chunks. */
    def gitDiff: Parser[GitDiff] = filesChanged ~ fileOperation ~ diffChunks ^^ {
        case files ~ op ~ chunks => GitDiff(files, op, chunks)
    }

    /** The `diff --git <old> <new>` line, yielding both names as written. */
    def filesChanged: Parser[(String, String)] =
        "diff --git " ~> filename ~ (" " ~> filename) <~ newline ^^ { case f1 ~ f2 => (f1,f2) }

    /** Optional new/deleted-file line followed by the mandatory index line.
      * Without a new/deleted-file line the operation is `UpdatedFile`.
      */
    def fileOperation: Parser[FileOperation] =
        opt(deletedFileMode | newFileMode) <~ index ^^ { _ getOrElse UpdatedFile }

    def index: Parser[Any] = ("index " ~ hash ~ ".." ~ hash) ~> opt(" " ~> mode) <~ newline
    def deletedFileMode: Parser[DeletedFile] = "deleted file mode " ~> mode <~ newline ^^ { m => DeletedFile(m) }
    def newFileMode: Parser[NewFile] = "new file mode " ~> mode <~ newline ^^ { m => NewFile(m) }

    // Abbreviated object names are not always 7 characters long (git lengthens them
    // when needed, and --full-index prints the full name), so any run of lowercase
    // hex digits is accepted.
    def hash: Parser[String] = """[0-9a-f]+""".r

    // The six mode digits are read as a decimal Int.
    def mode: Parser[Int] = """\d{6}""".r ^^ { _.toInt }

    /** The `---` / `+++` headers (their names are discarded) followed by the chunks. */
    def diffChunks: Parser[List[ChangeChunk]] = (oldFile ~ newFile) ~> rep1(changeChunk)

    def oldFile: Parser[String] = "--- " ~> filename <~ newline
    def newFile: Parser[String] = "+++ " ~> filename <~ newline
    def filename: Parser[String] = """[\S]+""".r

    /** One chunk: range header, then its change lines.
      *
      * Text that follows the closing `@@` on the header line (it starts with a space)
      * is parsed with `contextLine` and, when present, becomes the first element of
      * the chunk's lines. When absent, the header's own newline is consumed instead.
      */
    def changeChunk: Parser[ChangeChunk] = rangeInformation ~ opt(contextLine) ~ (opt(newline) ~> rep1(lineChange)) ^^ {
        case ri ~ opCtx ~ lines => ChangeChunk(ri, opCtx map (_ :: lines) getOrElse (lines))
    }

    /** Range header `@@ -start[,count] +start[,count] @@`.
      *
      * The unified format omits `,count` for one-line hunks, so a missing count is
      * read as 1.
      */
    def rangeInformation: Parser[RangeInformation] =
        ("@@ " ~> "-" ~> number) ~ opt("," ~> number) ~ (" +" ~> number) ~ opt("," ~> number) <~ " @@" ^^ {
            case a ~ b ~ c ~ d => RangeInformation(a, b getOrElse 1, c, d getOrElse 1)
        }

    // The leading marker character is dropped; only the text after it is kept.
    def lineChange: Parser[LineChange] = contextLine | addedLine | deletedLine
    def contextLine: Parser[ContextLine] = " " ~> """.*""".r <~ newline ^^ { l => ContextLine(l) }
    def addedLine: Parser[LineAdded] = "+" ~> """.*""".r <~ newline ^^ { l => LineAdded(l) }
    def deletedLine: Parser[LineRemoved] = "-" ~> """.*""".r <~ newline ^^ { l => LineRemoved(l) }

    def newline: Parser[String] = """\n""".r
    def number: Parser[Int] = """\d+""".r ^^ { _.toInt }

    /** Parses a complete diff held in a string; the whole input must be consumed. */
    def parse(str: String): ParseResult[List[GitDiff]] = parseAll(allDiffs, str)

    /** Parses the file named by the first argument, or stdin when there are no
      * arguments. Prints the parsed diffs, or fails via `sys.error` with the parser
      * message.
      */
    def main(args: Array[String]): Unit = {
        def report(result: ParseResult[List[GitDiff]]): Unit = result match {
            case Success(s,_) => println(s)
            case NoSuccess(msg,_) => sys.error("ERROR: " + msg)
        }
        args.headOption match {
            case Some(path) =>
                val reader = new java.io.FileReader(path)
                // Release the file handle even when parsing or reporting throws.
                try report(parseAll(allDiffs, reader)) finally reader.close()
            case None =>
                // read from stdin
                report(parseAll(allDiffs, Console.in))
        }
    }
}