# You are on page 1 of 2

"""Count how often each word length occurs in a text file and plot it.

Usage: woorden.py tekstbestand

The script reads a plain-text file (the helpers look for Project Gutenberg
style ``Title:`` / ``Author:`` header lines and START/END markers), counts
the words of every length from 1 up to the longest word, and shows the
counts as a bar chart.

The code runs on both Python 2.7 and Python 3: every ``print`` call takes a
single pre-formatted string so no ``__future__`` import is required.
"""

import os
import re
import string
import sys

import numpy as np

try:
    import matplotlib.pyplot as plt
except ImportError:  # plotting is optional; plotwords() reports the problem
    plt = None

try:
    from collections import Counter
except ImportError:  # Python < 2.7: fall back to the backport used originally
    from counter.counter import Counter


# Markers are grouped with (?:...) so that the alternation only covers
# THE/THIS instead of splitting the whole pattern in two.
_START_MARKER = re.compile(r"START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK")
_END_MARKER = re.compile(r"END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK")


def minstensx(word, n):
    """Return True when *word* has at least *n* characters."""
    return len(word) >= n


def hoogstensx(word, n):
    """Return True when *word* has at most *n* characters."""
    return len(word) <= n


def preciesx(word, n):
    """Return True when *word* has exactly *n* characters."""
    return len(word) == n


X = ""          # longest word found so far
THEMAX = 0      # length of X
M = THEMAX      # word length the one-argument predicates below compare against


def minstens(word):
    """Return True when *word* has at least ``M`` (module global) characters."""
    return minstensx(word, M)


def hoogstens(word):
    """Return True when *word* has at most ``M`` (module global) characters."""
    return hoogstensx(word, M)


def precies(word):
    """Return True when *word* has exactly ``M`` (module global) characters."""
    return preciesx(word, M)


def plotwords(tb, title, author, w):
    """Show a bar chart of the word-length counts in *w*.

    Parameters:
        tb:     file name, shown in the chart title.
        title:  list of title lines, annotated top-right one below the other.
        author: list of author lines; only the first one is annotated.
        w:      list of counts where ``w[i]`` belongs to word length ``i + 1``.

    Raises:
        RuntimeError: if matplotlib could not be imported.
    """
    if plt is None:
        raise RuntimeError("matplotlib is required to plot the word lengths")

    # The counts start at word length 1, so the bars are placed at 1..len(w)
    # to make the x axis show the real word length.
    lengths = np.arange(1, len(w) + 1)
    width = 0.1
    plt.bar(lengths, w, width, color='r')
    plt.xlabel('Lengte woord')
    plt.ylabel('Aantal voorkomens')
    plt.title('Bestand:' + tb + ', Totaal:' + "%s" % (sum(w)))

    # y keeps the position of the last title line; it starts at the top so
    # the author annotation is still well-defined when there is no title.
    y = 1
    for i, line in enumerate(title):
        y = 1 - (i + 1) * 0.05
        print("i=%d y=%.2f" % (i, y))
        plt.annotate(line, xy=(1, y), xycoords='axes fraction', ha='right')
    if author:
        plt.annotate(author[0], xy=(1, y - 0.05), xycoords='axes fraction',
                     ha='right')
    plt.show()


def _read_words(tb):
    """Return the whitespace-separated words of file *tb*."""
    with open(tb) as handle:
        return handle.read().split()


def _partition(f, words):
    """Split *words* into (words accepted by *f*, words rejected by *f*)."""
    accepted = []
    rejected = []
    for word in words:
        if f(word):
            accepted.append(word)
        else:
            rejected.append(word)
    return accepted, rejected


def dorank1(f, tb, d, e):
    """Count words per length, walking the global ``M`` down to 1.

    For every value of ``M`` from its current value down to 1, the words of
    *tb* accepted by predicate *f* are counted and removed from the pool.
    Every occurrence of a word is counted (duplicates are kept), which is
    the same counting rule dorank2 uses.

    Parameters:
        f:  one-argument predicate that reads the global ``M``
            (minstens, hoogstens or precies).
        tb: path of the text file.
        d:  list that receives the counts; each count is inserted at the
            front, so the final order is ascending in ``M``.
        e:  list that receives, in the same order, the counts expressed per
            10000 words of the file (0.0 for an empty file).

    Side effects: prints the total word count and decrements the global
    ``M`` once per step.
    """
    global M
    words = _read_words(tb)
    total = len(words)
    print(total)
    rest = words
    for _ in range(M, 0, -1):
        matched, rest = _partition(f, rest)
        count = len(matched)
        d.insert(0, count)
        # Guard against an empty file instead of dividing by zero.
        e.insert(0, (float(count) / float(total)) * 10000 if total else 0.0)
        M = M - 1


def findlongest(a, b):
    """Compare two words cmp-style and remember the longest word seen.

    Returns a negative number, zero or a positive number when *a* sorts
    before, equal to or after *b*. As a side effect the globals ``THEMAX``
    and ``X`` are updated when the longer of the two words is longer than
    the current record.
    """
    global THEMAX
    global X
    longer = a if len(a) > len(b) else b
    if len(longer) > THEMAX:
        THEMAX = len(longer)
        X = longer
    # Equivalent of the Python 2 built-in cmp(), which Python 3 removed.
    return (a > b) - (a < b)


def getsubsetoftext(tekst):
    """Return the part of *tekst* between the Project Gutenberg markers.

    The result starts at the "START OF THE/THIS PROJECT GUTENBERG EBOOK"
    marker and stops right after the "END OF THE/THIS PROJECT GUTENBERG
    EBOOK" marker; both marker texts are part of the result. A missing
    marker leaves that side of the text untouched. Progress information is
    printed along the way.
    """
    print("%s %s %s" % (type(tekst), len(tekst), tekst[0:50]))
    start = _START_MARKER.search(tekst)
    if start is not None:
        body = tekst[start.start():]
        print("%s %s %s" % (start.start(), start.end(), body[0:50]))
    else:
        body = tekst
    end = _END_MARKER.search(body)
    if end is not None:
        subset = body[:end.end()]
        print("%s %s %s" % (end.start(), end.end(), subset[0:50]))
    else:
        subset = body
    return subset


def dorank2(f, tb, d, e):
    """Count words per length, walking the global ``M`` from 1 upwards.

    The longest word of *tb* is determined first (stored in the globals
    ``X`` and ``THEMAX``). Then, for ``M`` = 1 .. ``THEMAX``, the words
    accepted by predicate *f* are counted, the count is appended to *d*, and
    those words are removed from the pool for the next step.

    Parameters:
        f:  one-argument predicate that reads the global ``M``
            (minstens, hoogstens or precies).
        tb: path of the text file.
        d:  list that receives one count per value of ``M``, ascending.
        e:  unused; kept so the signature matches dorank1.

    Side effects: sets the globals ``M``, ``THEMAX`` and ``X`` and prints
    the word count, the longest word, and for every step the ten most
    common words still in the pool.
    """
    global M, THEMAX, X
    words = _read_words(tb)
    print(len(words))

    # Determine the longest word directly; relying on a sort comparator to
    # see every word misses it for a single-word file.
    X = max(words, key=len) if words else ""
    THEMAX = len(X)
    rest = sorted(words)
    print("langste=%d s=%s" % (THEMAX, X))

    for M in range(1, THEMAX + 1, 1):
        matched, remaining = _partition(f, rest)
        d.append(len(matched))
        print(Counter(rest).most_common(10))
        rest = remaining


def getbookinfo(tb):
    """Extract the title and author lines from the header of file *tb*.

    Collecting starts at the line whose first word is ``Title:``; every
    following non-empty line is added to the title until a line whose first
    word is ``Author:`` is found. That line becomes the single author entry
    and ends the scan. The lines are stored with their ``Title:`` /
    ``Author:`` prefix, stripped of surrounding spaces and line endings.

    Returns:
        A tuple ``(title, author)`` of two lists of strings; either list is
        empty when the corresponding header line is not present.
    """
    title = []
    author = []
    in_title = False
    with open(tb) as handle:
        for line in handle:
            tokens = line.split()
            first = tokens[0] if tokens else ''
            if first == 'Author:':
                author.append(line.strip(' \r\n'))
                break
            if first == 'Title:':
                in_title = True
            if in_title:
                text = line.strip(' \r\n')
                if text:
                    title.append(text)
    return (title, author)


def main(argv):
    """Run the script for the command line *argv*; return the exit status."""
    print(argv)
    if not len(argv) == 2:
        print("gebruik: woorden.py tekstbestand")
        return 1
    tb = argv[1]
    (title, author) = getbookinfo(tb)
    print("%s %s" % (title, author))
    d = []
    e = []
    dorank2(precies, tb, d, e)
    print("sum(d)=%d len(d)=%d d=%s" % (sum(d), len(d), d))
    plotwords(tb, title, author, d)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))

# You might also like