#!/usr/bin/tclsh # ngramwords # # Written by and copyright Neil Smith ( neil@wimp.freeuk.com ) # Version 1.0, 8 August 2005 # # A simple program that generates random words based on a sample text. # # The program reads a set of words and generates an n-gram model of the # language. It then uses that model to generate new random words. # # Many languages, when transliterated in to the Latin alphabet, use more # than one Latin letter for each native letter. To accommodate this, # this program allows each element of the n-gram to be a multi-letter # 'token.' This is done by including in the file a line such as: # ligatures= th ch รบ # (but without the leading '# ' characters) # # To use this program, ensure Tcl/Tk is installed, and make this file # executable. Then run this program. # # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA # 02111-1307 USA or the Free Software Foundation Europe e.V., # Villa Vogelsang, Antonienallee 1, 45279 Essen, Germany # # For more details, see http://www.gnu.org/ # For the text of the GPL, see http://www.gnu.org/licenses/gpl.txt # The regular expression to be used to separate words into tokens. # For example, if the matchExpr is "[a-z]|th|ch", the letter pairs # "th" and "ch" will be treated as a single token each. # This expression should include accented characters and non-alphabetic # characters of the language, such as apostrophies and exclamation marks # This is set in the input file with the line 'ligatures= ...' global matchExpr set matchExpr "\[a-z\]" # The length of n-grams to use in the language model global nGramLength set nGramLength 3 # The number of words to generate global noOfWords set noOfWords 20 # The maximum length (in tokens) of words to generate # Over-long words are discared in the generation phase global maxWordLength set maxWordLength 15 # The file that contains the source corpus global inputFileName set inputFileName "stdin" # Where to send the results global outputFileName global ofid set outputFileName "stdout" # Flag whether to show the generated dictionaries global showDictionaries set showDictionaries False # If False, all words are converted to lowercase before tokenisation global caseSensitive set caseSensitive False global verbose set verbose False # The list of valid word starting n-grams global starts # The language model, as a list of key-value pairs. # The key is the n-gram (as a string); the value is a list of valid next tokens to choose from. # A space is used as the next token to signifiy the word should end. global dictionary proc showUsage {} { puts "ngramwords \[switches\] \[input file\]" puts "" puts "Uses an n-gram model to generate new words from a language" puts "" puts "Switches:" puts " -g n : Uses n as length of n-grams. Default 3" puts " -n n : Generates n words. Default 20" puts " -l n : Words are at most n letters long. Default 15" puts " -i f : Reads input from file f. Default is standard input" puts " -o f : Writes output to file f. Default is standard output" puts " -s : Show dictionary contents" puts " -c : case sensitive (defult is to convert all letters to lowercase)" puts " -v : Verbose mode" exit 0 } # Read options from the command line (not very bright) proc readCommandLine {} { global nGramLength global noOfWords global maxWordLength global outputFileName global inputFileName global outputFileName global showDictionaries global caseSensitive global verbose global argv global argc set argCount 0 while {$argCount < $argc} { switch -- [lindex $argv $argCount] { "-g" { if {$argCount + 1 < $argc} { set nGramLength [lindex $argv [expr $argCount + 1]] incr argCount } else { puts "-g must have a length" } } "-n" { if {$argCount + 1 < $argc} { set noOfWords [lindex $argv [expr $argCount + 1]] incr argCount } else { puts "-n must have a number of words to generate" } } "-l" { if {$argCount + 1 < $argc} { set maxWordLength [lindex $argv [expr $argCount + 1]] incr argCount } else { puts "-l must have a max word length" } } "-i" { if {$argCount + 1 < $argc} { set inputFileName [lindex $argv [expr $argCount + 1]] incr argCount } else { puts "-i must have an input file name" } } "-o" { if {$argCount + 1 < $argc} { set outputFileName [lindex $argv [expr $argCount + 1]] incr argCount } else { puts "-o must have an output file name" } } "-s" { set showDictionaries True} "-c" { set caseSensitive True} "-v" { set verbose True} "-h" { showUsage } "--help" {showUsage } default { set inputFileName [lindex $argv $argCount] } } incr argCount } if {$verbose} { puts "Using switches:" puts " Input file = $inputFileName" puts " Output file = $outputFileName" puts " n-gram length = $nGramLength" puts " Max word length = $maxWordLength n-grams" puts " Generating $noOfWords words" puts " Showing dictionaries = $showDictionaries" puts "" } } # Read the 'ligatures=' line and build a new matchExpr proc makeMatchExpr {ligLine} { # remove the ligatures= from the start of the line, trim whitespace, and split on whitespace set ligatures [split [string trim [string range $ligLine 10 [string length $ligLine]]]] set matchEx "\[a-z\]" foreach ligature $ligatures { set matchEx "$matchEx|$ligature" } return $matchEx } # Split a word into its tokens proc tokeniseWord {word} { global matchExpr global caseSensitive set currentIndex 0 set tokenList {} if {!$caseSensitive} { set word [string tolower $word] } while {$currentIndex < [string length $word]} { regexp -nocase -start $currentIndex $matchExpr $word match lappend tokenList $match set currentIndex [expr $currentIndex + [string length $match]] } return $tokenList } # Add the initial n-gram of the word to the list of valid word starts proc addWordToStarts {tokens} { global starts global nGramLength # If the word is shorter than the n-gram requires, pad it with spaces if {[llength $tokens] < $nGramLength} { for {set i [llength $tokens]} {$i < $nGramLength} {incr i} {lappend tokens " "} } lappend starts [lrange $tokens 0 [expr $nGramLength - 1]] } # For each n-gram in the word, add that n-gram (and its successor) to the dictionary proc addWordToDictionary {tokens} { global dictionary global nGramLength if {[llength $tokens] < $nGramLength} { for {set i [llength $tokens]} {$i < $nGramLength} {incr i} {lappend tokens " "} } else { while {[llength $tokens] > $nGramLength} { set tokenString [join [lrange $tokens 0 [expr $nGramLength - 1]] ""] lappend dictionary($tokenString) [lindex $tokens $nGramLength] set tokens [lrange $tokens 1 [llength $tokens]] } } set tokenString [join [lrange $tokens 0 [expr $nGramLength - 1]] ""] lappend dictionary($tokenString) " " } # Make a word, based on the language model proc makeWord {} { global starts global dictionary global nGramLength global maxWordLength # Pretend that we've just generated an over-long word set wordSoFar [split [string repeat "aa" $maxWordLength] ""] # While we have an over-long word... while {[llength $wordSoFar] > [expr $maxWordLength + 1]} { # Pick a new starting n-gram set wordSoFar [lindex $starts [expr (int (rand() * [llength $starts]))]] # While we haven't got a space as the next token, AND the word isn't too long while { (![string equal [lrange $wordSoFar end end] "{ }"]) && ([llength $wordSoFar] < $maxWordLength) } { # Generate the key for the current trailing n tokens set dictList $dictionary([join [lrange $wordSoFar [expr [llength $wordSoFar] - $nGramLength] end] ""]) # Randomly find the next token in the word set wordSoFar [lappend wordSoFar [lindex $dictList [expr (int (rand() * [llength $dictList]))]]] } } return [string trim [join $wordSoFar ""]] } # Show the dictionaries proc showDictionaries {} { global dictionary global starts global ofid puts $ofid "Starts = $starts" puts $ofid "" puts $ofid "Dictionary:" foreach key [lsort [array names dictionary]] { puts $ofid "$key -> $dictionary($key)" } puts $ofid "" } ## ## Start of the 'main' procedure ## readCommandLine # Read the file (or use stdin if none specified) if {![string equal $inputFileName "stdin"]} { set ifid [open $inputFileName r] } else { set ifid stdin } # Read the entire file set content [read $ifid] if {![string equal $inputFileName "stdin"]} { close $ifid } # Split the contents into lines set lines [split $content "\n"] # Iterate over the lines foreach line $lines { set line [string trim $line] # Ignore comment lines if {[string index $line 0] != "#"} { # Line is a new match expression, process it specially if {[string first "ligatures=" $line] != -1} { if {$verbose} {puts "$line"} set matchExpr [makeMatchExpr $line] } else { # Split into fields on whitespace set words [split [string trim $line]] foreach word $words { set tokens [tokeniseWord $word] addWordToStarts $tokens addWordToDictionary $tokens } } } } if {![string equal $outputFileName "stdout"]} { set ofid [open $outputFileName w] } else { set ofid stdout } if {$showDictionaries} {showDictionaries} for {set i 0} {$i < $noOfWords} {incr i} { set thisWord [makeWord] puts $ofid $thisWord } if {![string equal $outputFileName "stdout"]} { close $ofid } exit 0