More actions
imported>teledong No edit summary |
imported>teledong No edit summary |
||
| Line 1: | Line 1: | ||
Describe 데블스캠프2011/둘째날/Machine-Learning/NaiveBayesClassifier/김동준 here | Describe 데블스캠프2011/둘째날/Machine-Learning/NaiveBayesClassifier/김동준 here | ||
Train.java | |||
package org.zeropage.machinelearn; | |||
import java.io.*; | import java.io.*; | ||
import java.util.*; | import java.util.*; | ||
class Train { | |||
public | private Map<String,Integer> economyWord; | ||
private Map<String,Integer> politicWord; | |||
private int economyNum; | |||
private int politicNum; | |||
private boolean isSkipData(String inputStr) { // 자신의 사이트, 블로그, 페이지 주소의 경우 연관성이 떨어지므로 검색에서 제외 | |||
if(inputStr.length() == 1 || inputStr.equals("http") || inputStr.equals("blog") || inputStr.equals("com") || | |||
inputStr.equals("naver") || inputStr.equals("empas") || inputStr.equals("daum") || inputStr.equals("yahoo") || | |||
inputStr.equals("tistory") || inputStr.equals("co") || inputStr.equals("kr") || inputStr.equals("www") || inputStr.equals("ohmynews") || | |||
inputStr.equals("//") || inputStr.equals("블로그")) { | |||
return true; | |||
} | |||
else { return false; } | |||
} | |||
public void TrainData() { | |||
this.economyNum = 0; | |||
this.politicNum = 0; | |||
this.economyWord = new HashMap<String,Integer>(); | |||
this.politicWord = new HashMap<String,Integer>(); | |||
try { | try { | ||
economyLearn = new | Scanner economyLearn = new Scanner(new File("svm_data.tar/package/train/economy/index.economy.db")); | ||
while(economyLearn.hasNextLine()) { | |||
String[] a = economyLearn.nextLine().split(" "); | |||
String[] a = | |||
for(String wordTmp:a) { | for(String wordTmp:a) { | ||
if(wordTmp | if(isSkipData(wordTmp)) {continue;} | ||
if( economyWord.get(wordTmp) == null) { | if( this.economyWord.get(wordTmp) == null) { | ||
economyNum++; | this.economyNum++; | ||
economyWord.put(wordTmp, 1); | this.economyWord.put(wordTmp, 1); | ||
} | } | ||
else { economyWord.put(wordTmp, economyWord.get(wordTmp)+1); } | else { this.economyWord.put(wordTmp, this.economyWord.get(wordTmp)+1); } | ||
} | } | ||
} | } | ||
economyLearn.close(); | |||
while( | Scanner politicLearn = new Scanner(new File("svm_data.tar/package/train/politics/index.politics.db")); | ||
String[] a = | while(politicLearn.hasNextLine()) { | ||
String[] a = politicLearn.nextLine().split(" "); | |||
for(String wordTmp:a) { | for(String wordTmp:a) { | ||
if(wordTmp | if(isSkipData(wordTmp)) {continue; } | ||
if (politicWord.get(wordTmp) == null ) { | if (this.politicWord.get(wordTmp) == null ) { | ||
politicNum++; | this.politicNum++; | ||
politicWord.put(wordTmp, 1); | this.politicWord.put(wordTmp, 1); | ||
} | } | ||
else { politicWord.put(wordTmp, politicWord.get(wordTmp)+1); } | else { this.politicWord.put(wordTmp, this.politicWord.get(wordTmp)+1); } | ||
} | } | ||
} | } | ||
politicLearn.close(); | |||
} catch (FileNotFoundException e) { | } catch (FileNotFoundException e) { | ||
// TODO Auto-generated catch block | // TODO Auto-generated catch block | ||
| Line 47: | Line 59: | ||
e.printStackTrace(); | e.printStackTrace(); | ||
} | } | ||
} | |||
public HashMap<String,Integer> getEconomyData() { | |||
return (HashMap<String, Integer>) this.economyWord; | |||
} | |||
public HashMap<String,Integer> getPoliticData() { | |||
return (HashMap<String, Integer>) this.politicWord; | |||
} | |||
public int getEconomyNumber() { | |||
return this.economyNum; | |||
} | |||
public int getPoliticNumber() { | |||
return this.politicNum; | |||
} | |||
} | |||
Analyzer.java | |||
package org.zeropage.machinelearn; | |||
import java.util.*; | |||
import java.io.*; | |||
public class Analyzer { | |||
private static HashMap<String,Integer> ecoData; | |||
private static HashMap<String,Integer> polData; | |||
private static Train machineTrain; | |||
private static double DocumentResult(File f, boolean isEconomy) { | |||
double negaNum = 0; | |||
double posiNum = 0; | |||
double ecoResultNum = 0; | |||
double polResultNum = 0; | |||
double reslt = 0; | |||
try { | try { | ||
Scanner targetDocument = new Scanner(f); | |||
while(targetDocument.hasNextLine()) { | |||
String[] a = targetDocument.nextLine().split(" "); | |||
for(String wordTmp:a) { | |||
if(ecoData.get(wordTmp) == null) { ecoResultNum = 0; } | |||
else { ecoResultNum = ecoData.get(wordTmp); } | |||
if(polData.get(wordTmp) == null) { polResultNum = 0; } | |||
else { polResultNum = polData.get(wordTmp); } | |||
String[] | ecoResultNum += 1; | ||
for(String | polResultNum += 1; | ||
if( | if(isEconomy && polData.get(wordTmp) == null) { polResultNum -= 0.5; } // 경제파트이면서 정치쪽에 없는 단어에 Advantage 부과 | ||
else { ecoResultNum = | if(!isEconomy && ecoData.get(wordTmp) == null) { ecoResultNum -= 0.5; } // 정치파트이면서 경제쪽에 없는 단어에 Adventage 부과 | ||
if( | if(isEconomy) { reslt += Math.log(ecoResultNum / polResultNum); } | ||
else { polResultNum = | else { reslt += Math.log(polResultNum / ecoResultNum); } | ||
polResultNum+=1; | |||
ecoResultNum | |||
reslt += Math.log(ecoResultNum / polResultNum); | |||
} | } | ||
if(reslt < 0) { negaNum+= 1; } | if(reslt < 0) { negaNum+= 1; } | ||
else { posiNum += 1; } | else { posiNum += 1; } | ||
reslt = 0; | reslt = 0; | ||
} | } | ||
targetDocument.close(); | targetDocument.close(); | ||
} catch (FileNotFoundException e) { | } catch (FileNotFoundException e) { | ||
// TODO Auto-generated catch block | // TODO Auto-generated catch block | ||
e.printStackTrace(); | e.printStackTrace(); | ||
} | } | ||
return posiNum / (posiNum+negaNum); | |||
} | |||
public static void Init() { | |||
machineTrain = new Train(); | |||
machineTrain.TrainData(); | |||
ecoData = machineTrain.getEconomyData(); | |||
polData = machineTrain.getPoliticData(); | |||
} | |||
public static void main(String[] args) { | |||
Init(); | |||
double result1 = DocumentResult(new File("svm_data.tar/package/test/economy/economy.txt"), true); | |||
System.out.println(result1); | |||
double result2 = DocumentResult(new File("svm_data.tar/package/test/politics/politics.txt"), false); | |||
System.out.println(result2); | |||
System.out.println((result1 + result2) / 2); | |||
} | } | ||
} | } | ||
Train 의 Economy.txt 파일 적중도 : 0.995 (99.5%) | |||
Train 의 Politics.txt 파일 적중도 : 0.96 (96%) | |||
전체 평균 적중도 : 0.9775 (97.75%) | |||
위의 주석처럼 약간의 Advantage 와 필요없는 (http, //, blog, yahoo, empas, tistory 같은) 단어를 제외하고 작성할 수 있게 수정했습니다. | |||
이 결과를 볼 수 있었으면 좋겠네요 ^^;; | |||
Revision as of 05:24, 29 June 2011
Describe 데블스캠프2011/둘째날/Machine-Learning/NaiveBayesClassifier/김동준 here

Train.java
package org.zeropage.machinelearn;
import java.io.*;
import java.util.*;
/**
 * Builds word-frequency tables for the "economy" and "politics" categories
 * from two fixed training files.
 *
 * <p>Each training file is read line by line, every line is split on a single
 * space, and every token that is not filtered by {@link #isSkipData(String)}
 * is counted. The tables are empty until {@link #TrainData()} has been called.
 */
class Train {
    /** Location of the economy training data, relative to the working directory. */
    private static final String ECONOMY_TRAIN_FILE =
            "svm_data.tar/package/train/economy/index.economy.db";

    /** Location of the politics training data, relative to the working directory. */
    private static final String POLITICS_TRAIN_FILE =
            "svm_data.tar/package/train/politics/index.politics.db";

    /**
     * Tokens that come from site, blog and page addresses. They carry little
     * information about the category, so they are excluded from the counts.
     */
    private static final Set<String> SKIP_WORDS = new HashSet<String>(Arrays.asList(
            "http", "blog", "com", "naver", "empas", "daum", "yahoo", "tistory",
            "co", "kr", "www", "ohmynews", "//", "블로그"));

    // Initialised eagerly so the getters never hand out null before training.
    private HashMap<String,Integer> economyWord = new HashMap<String,Integer>();
    private HashMap<String,Integer> politicWord = new HashMap<String,Integer>();

    /**
     * Tells whether a token must be left out of the frequency tables.
     *
     * @param inputStr a token obtained by splitting a training line on spaces
     * @return {@code true} for one-character tokens and for the address-like
     *         tokens listed in {@code SKIP_WORDS}; {@code false} otherwise
     */
    private boolean isSkipData(String inputStr) {
        return inputStr.length() == 1 || SKIP_WORDS.contains(inputStr);
    }

    /**
     * Adds the occurrence count of every non-skipped token of a file to a table.
     *
     * @param path   file to read
     * @param counts table that receives the counts (token to occurrences)
     * @throws FileNotFoundException if {@code path} cannot be opened
     */
    private void countWords(String path, Map<String,Integer> counts) throws FileNotFoundException {
        Scanner reader = new Scanner(new File(path));
        try {
            while (reader.hasNextLine()) {
                for (String word : reader.nextLine().split(" ")) {
                    if (isSkipData(word)) {
                        continue;
                    }
                    Integer seen = counts.get(word);
                    counts.put(word, seen == null ? 1 : seen + 1);
                }
            }
        } finally {
            // Release the file handle even if reading fails half-way.
            reader.close();
        }
    }

    /**
     * Rebuilds both frequency tables from the training files.
     *
     * <p>Both tables are replaced by fresh, empty maps first, so maps returned
     * by earlier getter calls are not modified. The economy file is processed
     * before the politics file. If a file cannot be opened, the stack trace is
     * printed and training stops at that point, leaving whatever has been
     * counted so far.
     */
    public void TrainData() {
        this.economyWord = new HashMap<String,Integer>();
        this.politicWord = new HashMap<String,Integer>();
        try {
            countWords(ECONOMY_TRAIN_FILE, this.economyWord);
            countWords(POLITICS_TRAIN_FILE, this.politicWord);
        } catch (FileNotFoundException e) {
            // Best effort, as before: report the problem and keep the partial tables.
            e.printStackTrace();
        }
    }

    /**
     * @return the live economy table (token to occurrence count); never
     *         {@code null}, empty before training
     */
    public HashMap<String,Integer> getEconomyData() {
        return this.economyWord;
    }

    /**
     * @return the live politics table (token to occurrence count); never
     *         {@code null}, empty before training
     */
    public HashMap<String,Integer> getPoliticData() {
        return this.politicWord;
    }

    /** @return the number of distinct tokens counted for the economy category */
    public int getEconomyNumber() {
        return this.economyWord.size();
    }

    /** @return the number of distinct tokens counted for the politics category */
    public int getPoliticNumber() {
        return this.politicWord.size();
    }
}
Analyzer.java
package org.zeropage.machinelearn;
import java.util.*;
import java.io.*;
/**
 * Scores the economy and politics test documents against the word-frequency
 * tables produced by {@code Train} and prints the fraction of lines that were
 * classified in favour of the expected category.
 */
public class Analyzer {
    private static HashMap<String,Integer> ecoData;
    private static HashMap<String,Integer> polData;
    private static Train machineTrain;

    /**
     * Computes the log-ratio contribution of a single word.
     *
     * <p>Both counts get +1 so that a missing word never yields a zero count.
     * On top of that, the count of the <em>other</em> category is reduced by
     * 0.5 when the word does not occur there at all, which gives the expected
     * category a small advantage.
     *
     * @param word      token taken from the document under test
     * @param isEconomy {@code true} when the document is expected to be economy
     * @return {@code log(expected / other)} of the adjusted counts
     */
    private static double wordScore(String word, boolean isEconomy) {
        Integer ecoCount = ecoData.get(word);
        Integer polCount = polData.get(word);

        double ecoWeight = (ecoCount == null) ? 1.0 : ecoCount + 1.0;
        double polWeight = (polCount == null) ? 1.0 : polCount + 1.0;

        if (isEconomy) {
            // Economy document: advantage for words unknown to the politics table.
            if (polCount == null) {
                polWeight -= 0.5;
            }
            return Math.log(ecoWeight / polWeight);
        }

        // Politics document: advantage for words unknown to the economy table.
        if (ecoCount == null) {
            ecoWeight -= 0.5;
        }
        return Math.log(polWeight / ecoWeight);
    }

    /**
     * Classifies every line of a document and reports the hit ratio.
     *
     * <p>A line counts as a hit when the sum of its word scores is not
     * negative. If the file cannot be opened the stack trace is printed; with
     * no lines read the result is {@code 0.0 / 0.0}, i.e. {@code NaN}.
     *
     * @param f         document to classify, one sample per line
     * @param isEconomy {@code true} when the lines are expected to be economy,
     *                  {@code false} when they are expected to be politics
     * @return hits divided by the number of lines read
     */
    private static double DocumentResult(File f, boolean isEconomy) {
        double misses = 0;
        double hits = 0;
        try {
            Scanner lines = new Scanner(f);
            while (lines.hasNextLine()) {
                double lineScore = 0;
                for (String token : lines.nextLine().split(" ")) {
                    lineScore += wordScore(token, isEconomy);
                }
                if (lineScore < 0) {
                    misses += 1;
                } else {
                    hits += 1;
                }
            }
            lines.close();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
        return hits / (hits + misses);
    }

    /** Trains a new {@code Train} instance and caches its two frequency tables. */
    public static void Init() {
        Train trainer = new Train();
        machineTrain = trainer;
        trainer.TrainData();
        ecoData = trainer.getEconomyData();
        polData = trainer.getPoliticData();
    }

    /**
     * Trains, evaluates both test documents and prints the economy hit ratio,
     * the politics hit ratio and their mean, one value per line.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Init();

        File economyDoc = new File("svm_data.tar/package/test/economy/economy.txt");
        double result1 = DocumentResult(economyDoc, true);
        System.out.println(result1);

        File politicsDoc = new File("svm_data.tar/package/test/politics/politics.txt");
        double result2 = DocumentResult(politicsDoc, false);
        System.out.println(result2);

        System.out.println((result1 + result2) / 2);
    }
}
Train의 Economy.txt 파일 적중도: 0.995 (99.5%)
Train의 Politics.txt 파일 적중도: 0.96 (96%)
전체 평균 적중도: 0.9775 (97.75%)

위의 주석처럼 약간의 Advantage를 부여하고, 필요 없는 단어(http, //, blog, yahoo, empas, tistory 등)를 제외하도록 수정했습니다. 이 결과를 볼 수 있었으면 좋겠네요 ^^;;