新幹線でのひまつぶし。
はてなダイアリーの記事は生テキスト、MT(Movable Type)、CSVなどの形式で
ダウンロード出来る。
MT(Movable Type)形式は次のようになっている。
AUTHOR: seinzumtode TITLE: Mobile端末用のネイティブアプリケーション化フレームワーク STATUS: Publish ALLOW COMMENTS: 1 CONVERT BREAKS: 0 ALLOW PINGS: 1 DATE: 01/06/2012 01:26:01 AM CATEGORY: JavaScript ----- BODY: <div class="section"> <p>マルチプラットフォームに対応したネイティブアプリ化のフレームワークが流行って\ るらしい。<br>PhoneGapをインストールしてみたけど、Titaniumのインストールに比べ\\ たら全然楽だった。<br>あと注目なのがRhodes。<br>(インストールとチュートリアル\\ )<br><a href="http://fujitaiju.com/blog/ruby/rubyiosrhodes%E3%82%92%E3%82%A4%E\ 3%83%B3%E3%82%B9%E3%83%88%E3%83%BC%E3%83%AB%E3%81%97%E3%81%A6iphone%E3%82%B7%E3\ %83%9F%E3%83%A5%E3%83%AC%E3%83%BC%E3%82%BF%E3%82%92%E8%B5%B7%E5%8B%95%E3%81%97/\ " target="_blank">http://fujitaiju.com/blog/ruby/rubyiosrhodes%E3%82%92%E3%82%A\ 4%E3%83%B3%E3%82%B9%E3%83%88%E3%83%BC%E3%83%AB%E3%81%97%E3%81%A6iphone%E3%82%B7\ %E3%83%9F%E3%83%A5%E3%83%AC%E3%83%BC%E3%82%BF%E3%82%92%E8%B5%B7%E5%8B%95%E3%81%\ 97/</a><br><a href="http://ngmy-lpk.blogspot.com/2011/04/rubyiphonerhodes.html"\ target="_blank">http://ngmy-lpk.blogspot.com/2011/04/rubyiphonerhodes.html</a>\ <br>Railsで書いてるアプリをrhodesでもう一度書きなおしてネイティブアプリ化すると\ か。<br>Javascriptがすごい重要になってきてるように感じる。</p> </div> ----- EXTENDED BODY: ----- EXCERPT:
まずAUTHOR: seinzumtodeから次のAUTHOR: seinzumtodeまでを
bodyとして抽出するスクリプトを書く。
コードブロック内にglobalが入り乱れてるあたり、
インスタンスつくったほうが実装としては綺麗になると思う。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Extract hatena-diary articles from an MT (Movable Type) export.

Each article starts at a line beginning with "AUTHOR: seinzumtode"; the
text up to the next such line is treated as the article body.  The
resulting (title, body) pairs are inserted into the `articles` table.
"""

MARKER = "AUTHOR: seinzumtode"


def parse_articles(lines, marker=MARKER):
    """Yield (title, body) pairs from an iterable of MT-export lines.

    A new article begins on every line that *starts* with `marker`.
    Bug fixes vs. the original: `if line.find(marker):` was inverted
    (find() returns -1, which is truthy, when the marker is absent and
    0, falsy, at a real marker), and the trailing article was never
    committed; it is flushed here when the input is exhausted.
    The marker line itself is not part of the body.
    """
    title = None
    body = []
    for line in lines:
        if line.find(marker) == 0:
            if title is not None:
                yield title, "".join(body)
            # Original title extraction kept: text after the first colon
            # of the marker line.  NOTE(review): the "TITLE:" header line
            # would probably be the better source -- confirm intent.
            title = line.split(':')[1]
            body = []
        elif title is not None:
            body.append(line)
    if title is not None:
        yield title, "".join(body)


def main():
    import MySQLdb  # third-party; function-scope so parsing stays testable

    connector = MySQLdb.connect(host="localhost", db="hatenadiary",
                                user="dbuser", passwd="dbpasswd",
                                charset="utf8")
    cursor = connector.cursor()

    # (Re)create the target table; ignore the error if it did not exist.
    try:
        cursor.execute("drop table articles")
    except MySQLdb.Error:
        pass
    cursor.execute("create table articles(title varchar(255),body text)")

    fin = open("seinzumtode.txt")
    try:
        for title, body in parse_articles(fin):
            try:
                # Parameterized insert -- never build SQL by concatenation.
                cursor.execute("insert into articles values(%s,%s)",
                               (title, body))
                connector.commit()
                print("committed")
            except MySQLdb.Error:
                # Best-effort, as in the original: skip rows that fail.
                pass
    finally:
        fin.close()
        cursor.close()
        connector.close()


if __name__ == "__main__":
    main()
次にカテゴリなどのヘッダ的な部分を削除する。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Copy `articles` into `articles_modified`, stripping the MT header.

Everything up to (and including) the first "BODY:" marker is removed
from each body before it is re-inserted.
"""

MARKER = "BODY:"


def strip_header(body, marker=MARKER):
    """Return `body` without the MT header that precedes `marker`.

    Bug fix: the original tested find(marker) > 0, which silently kept
    the header when the marker sat at index 0; != -1 covers that edge.
    When the marker is absent the body is returned unchanged, as before.
    """
    if body.find(marker) != -1:
        # Same semantics as the original split()[1]: the segment between
        # the first and second occurrence of the marker (or to the end).
        return body.split(marker)[1]
    return body


def main():
    import MySQLdb  # third-party; function-scope so strip_header stays testable

    connector = MySQLdb.connect(host="localhost", db="hatenadiary",
                                user="dbuser", passwd="dbpasswd",
                                charset="utf8")
    cursor = connector.cursor()
    try:
        cursor.execute("drop table articles_modified")
    except MySQLdb.Error:
        pass
    cursor.execute("create table articles_modified(title varchar(255),body text)")
    cursor.execute("select * from articles")

    # Second connection so we can insert while still iterating the select.
    connector2 = MySQLdb.connect(host="localhost", db="hatenadiary",
                                 user="dbuser", passwd="dbpasswd",
                                 charset="utf8")
    cursor2 = connector2.cursor()

    for title, body in cursor:
        cursor2.execute("insert into articles_modified values(%s,%s)",
                        (title.encode("utf8"),
                         strip_header(body.encode("utf8"))))
        connector2.commit()

    cursor.close()
    cursor2.close()
    connector.close()
    connector2.close()


if __name__ == "__main__":
    main()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Copy `articles_modified` into `articles_tagdeleted` with markup removed."""

import re


def strip_markup(body):
    """Return `body` with HTML tags and the literal 'EXTENDED' removed.

    Bug fix: the original ran the second re.sub on `body` instead of
    `text`, which threw away the tag-removal result entirely.
    """
    text = re.sub(r'<[^<]+>', "", body)
    text = re.sub(r'EXTENDED', "", text)
    return text


def main():
    import MySQLdb  # third-party; function-scope so strip_markup stays testable

    connector = MySQLdb.connect(host="localhost", db="hatenadiary",
                                user="dbuser", passwd="dbpasswd",
                                charset="utf8")
    cursor = connector.cursor()
    # Second connection so we can insert while still iterating the select.
    connector2 = MySQLdb.connect(host="localhost", db="hatenadiary",
                                 user="dbuser", passwd="dbpasswd",
                                 charset="utf8")
    cursor2 = connector2.cursor()

    try:
        cursor.execute("drop table articles_tagdeleted")
    except MySQLdb.Error:
        pass
    cursor.execute("create table articles_tagdeleted(title varchar(255),body text)")
    cursor.execute("select * from articles_modified")

    for title, body in cursor:
        cursor2.execute("insert into articles_tagdeleted values(%s,%s)",
                        (title.encode("utf8"),
                         strip_markup(body.encode("utf8"))))
        connector2.commit()

    cursor.close()
    cursor2.close()
    connector.close()
    connector2.close()


if __name__ == "__main__":
    main()
TF-IDFを計算する。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Compute TF-IDF over the `articles_tagdeleted` bodies.

Nouns are extracted with MeCab; TF is the corpus-wide term count and
DF the number of *documents* a term appears in.
"""

import re
from math import log


def getNoun(words):
    """Return the list of noun surfaces MeCab finds in `words`.

    ASCII punctuation is stripped from each surface; empty or
    whitespace-only results are discarded.
    """
    import MeCab  # third-party; function-scope so the rest stays testable

    noun = []
    tagger = MeCab.Tagger("-Ochasen")
    node = tagger.parseToNode(words.encode("utf-8"))
    while node:
        if node.feature.split(",")[0] == "名詞":  # "noun" part of speech
            surface = re.sub("[!-/:-@[-`{-~]", "", node.surface)
            if surface != "" and surface != " ":
                noun.append(surface)
        node = node.next
    return noun


def getTopKeywords(TF, n):
    """Return the n highest-valued (term, value) pairs of the dict TF."""
    # `ranked` instead of `list`: the original shadowed the builtin.
    ranked = sorted(TF.items(), key=lambda x: x[1], reverse=True)
    return ranked[0:n]


def calcTFIDF(N, TF, DF):
    """TF-IDF for a corpus of N documents: TF * log(N / DF)."""
    # DF * 1.0 forces float division under Python 2.
    return TF * log(N / (DF * 1.0))


def main():
    import MySQLdb  # third-party; function-scope, connects only when run

    connector = MySQLdb.connect(host="localhost", db="hatenadiary",
                                user="dbuser", passwd="dbpasswd",
                                charset="utf8")
    cursor = connector.cursor()
    cursor.execute("select * from articles_tagdeleted")

    N = 841  # number of documents in the corpus
    tf = {}
    df = {}
    for c in cursor:
        noun = getNoun(c[1])
        for word in noun:
            tf[word] = tf.get(word, 0) + 1
        # Bug fix: DF must count each word at most once per document.
        # The original never appended to its `df_list`, so DF counted
        # every occurrence and log(N/DF) went negative for common terms
        # (visible in the published output below).
        for word in set(noun):
            df[word] = df.get(word, 0) + 1

    tfidf = {}
    for k, v in getTopKeywords(tf, 1000):
        tfidf[k] = calcTFIDF(N, tf[k], df[k])
    for k, v in getTopKeywords(tfidf, 1000):
        print("%s %s" % (k, v))

    cursor.close()
    connector.close()


if __name__ == "__main__":
    main()
実行結果は以下。上位100件を出してみる。
>hatenadiary python tfidf.py h 309.38636837 ? 309.382410564 print 309.368006754 title 309.293693892 in 309.293693892 42 309.271899962 ( 309.092732872 st 309.092732872 if 308.999105234 X 308.894496928 1 308.432522777 u 308.399206716 2 308.315347795 images 307.737811866 void 307.725075635 digitalWrite 307.629040599 w 307.623395993 import 307.300898181 ak 306.765819263 cdn 306.62779542 0 306.038386006 value 305.897548515 305.556754928 n 305.455546268 g 305.040771953 for 304.988543732 to 304.10337781 highlight 303.980360879 bin 303.802802104 303.067552114 38 303.067552114 syntax 303.065612597 t 303.065612597 10 302.848402624 tex 302.397990514 mimetex 301.931318062 y 300.948656791 A 300.599156136 c 298.258269693 41 290.693840432 5 290.063558696 int 286.085749151 40 279.117075809 278.728078969 ? 273.473713676 8 265.150064374 4 263.302665564 option 262.835734536 cgi 260.947808308 alt 257.566781502 www 253.584012994 html 251.031304376 img 241.445219412 image 238.114506925 P 221.092599131 td 213.613808122 src 209.136782613 ? 193.846981835 s 158.433880872 id 154.611676576 synPreProc 117.723600805 d 101.609758881 i 88.5392767056 fotolife 60.5790756482 3 48.4833105344 section 27.5286274956 ne 25.5938905866 x -29.4943499578 -67.4495673394 synSpecial -186.128698197 seinzumtode -231.829994896 blank -345.474991657 target -366.439877245 com -394.292182939 jp -505.177676612 -510.791600426 f -519.236264281 href -522.057473024 div -722.560851831 pre -785.223328455 2 -847.477414006 hatena -994.699504412 synComment -1017.58076135 60 -1140.54343437 synType -1317.1334031 1 -1488.92593482 62 -1557.61492055 0 -1663.91247984 synIdentifier -1717.70713412 synStatement -2135.20505733 -2584.54836528 http -3041.46663637 a -3396.45881278 p -5474.09355027 synConstant -6608.95531372 34 -6783.34732784 -7435.33706101 br -12773.8578183 class -45829.4410734 span -94651.5642078
ストップワード入れてないのでゴミが多い。
digitalWriteがTF-IDFが高いのは面白いと思った。