Toggle menu
Toggle personal menu
Not logged in
Your IP address will be publicly visible if you make any edits.

데블스캠프2011/둘째날/Machine-Learning/NaiveBayesClassifier/김동준: Difference between revisions

From ZeroWiki
imported>teledong
No edit summary
imported>teledong
No edit summary
Line 1: Line 1:
Describe 데블스캠프2011/둘째날/Machine-Learning/NaiveBayesClassifier/김동준 here
Describe 데블스캠프2011/둘째날/Machine-Learning/NaiveBayesClassifier/김동준 here
Train.java
package org.zeropage.machinelearn;
  import java.io.*;
  import java.io.*;
  import java.util.*;
  import java.util.*;
   
   
  public class Native {
  class Train {
  public static void main(String[] args) {
private Map<String,Integer> economyWord;
  int economyNum = 0;
private Map<String,Integer> politicWord;
  int politicNum = 0;
private int economyNum;
  Map<String,Integer> economyWord = new HashMap<String,Integer>();
private int politicNum;
  Map<String,Integer> politicWord = new HashMap<String,Integer>();
private boolean isSkipData(String inputStr) { // 자신의 사이트, 블로그, 페이지 주소의 경우 연관성이 떨어지므로 검색에서 제외
BufferedReader economyLearn;
if(inputStr.length() == 1 || inputStr.equals("http") || inputStr.equals("blog") || inputStr.equals("com") ||
BufferedReader politicLearn;
inputStr.equals("naver") || inputStr.equals("empas") || inputStr.equals("daum") || inputStr.equals("yahoo") ||
inputStr.equals("tistory") || inputStr.equals("co") || inputStr.equals("kr") || inputStr.equals("www") || inputStr.equals("ohmynews") ||
inputStr.equals("//") || inputStr.equals("블로그")) {
return true;
}
else { return false; }
}
  public void TrainData() {
  this.economyNum = 0;
  this.politicNum = 0;
  this.economyWord = new HashMap<String,Integer>();
  this.politicWord = new HashMap<String,Integer>();
  try {
  try {
  economyLearn = new BufferedReader(new FileReader(new File("C:/Users/KDJ/Documents/Devils/svm_data.tar/package/train/economy/index.economy.db")));
  Scanner economyLearn = new Scanner(new File("svm_data.tar/package/train/economy/index.economy.db"));
  politicLearn = new BufferedReader(new FileReader(new File("C:/Users/KDJ/Documents/Devils/svm_data.tar/package/train/politics/index.politics.db")));
  while(economyLearn.hasNextLine()) {
String tmp = economyLearn.readLine();
  String[] a = economyLearn.nextLine().split(" ");
while(tmp != null) {
  String[] a = tmp.split(" ");
  for(String wordTmp:a) {
  for(String wordTmp:a) {
  if(wordTmp.length() == 1) {continue; }
  if(isSkipData(wordTmp)) {continue;}
  if( economyWord.get(wordTmp) == null) {
  if( this.economyWord.get(wordTmp) == null) {
  economyNum++;
  this.economyNum++;
  economyWord.put(wordTmp, 1);
  this.economyWord.put(wordTmp, 1);
  }
  }
  else { economyWord.put(wordTmp, economyWord.get(wordTmp)+1); }
  else { this.economyWord.put(wordTmp, this.economyWord.get(wordTmp)+1); }
  }
  }
tmp = economyLearn.readLine();
  }
  }
  tmp = politicLearn.readLine();
  economyLearn.close();
  while(tmp != null) {
Scanner politicLearn = new Scanner(new File("svm_data.tar/package/train/politics/index.politics.db"));
  String[] a = tmp.split(" ");
  while(politicLearn.hasNextLine()) {
  String[] a = politicLearn.nextLine().split(" ");
  for(String wordTmp:a) {
  for(String wordTmp:a) {
  if(wordTmp.length() == 1) {continue; }
  if(isSkipData(wordTmp)) {continue; }
  if (politicWord.get(wordTmp) == null ) {
  if (this.politicWord.get(wordTmp) == null ) {
  politicNum++;
  this.politicNum++;
  politicWord.put(wordTmp, 1);
  this.politicWord.put(wordTmp, 1);
  }
  }
  else { politicWord.put(wordTmp, politicWord.get(wordTmp)+1); }
  else { this.politicWord.put(wordTmp, this.politicWord.get(wordTmp)+1); }
  }
  }
tmp = politicLearn.readLine();
  }
  }
politicLearn.close();
  } catch (FileNotFoundException e) {
  } catch (FileNotFoundException e) {
  // TODO Auto-generated catch block
  // TODO Auto-generated catch block
Line 47: Line 59:
  e.printStackTrace();
  e.printStackTrace();
  }
  }
}
public HashMap<String,Integer> getEconomyData() {
return (HashMap<String, Integer>) this.economyWord;
}
public HashMap<String,Integer> getPoliticData() {
return (HashMap<String, Integer>) this.politicWord;
}
public int getEconomyNumber() {
return this.economyNum;
}
public int getPoliticNumber() {
return this.politicNum;
}
}
Analyzer.java
package org.zeropage.machinelearn;
import java.util.*;
import java.io.*;
public class Analyzer {
private static HashMap<String,Integer> ecoData;
private static HashMap<String,Integer> polData;
private static Train machineTrain;
private static double DocumentResult(File f, boolean isEconomy) {
double negaNum = 0;
double posiNum = 0;
double ecoResultNum = 0;
double polResultNum = 0;
double reslt = 0;
  try {
  try {
  double ecoResultNum = 0;
  Scanner targetDocument = new Scanner(f);
double polResultNum = 0;
  while(targetDocument.hasNextLine()) {
double posiNum = 0;
  String[] a = targetDocument.nextLine().split(" ");
double negaNum = 0;
  for(String wordTmp:a) {
double reslt = Math.log((double)economyNum/politicNum);
  if(ecoData.get(wordTmp) == null) { ecoResultNum = 0; }
BufferedReader targetDocument = new BufferedReader(new FileReader(new File("C:/Users/KDJ/Documents/Devils/svm_data.tar/package/test/economy/economy.txt")));
  else { ecoResultNum = ecoData.get(wordTmp); }
  String tmp = targetDocument.readLine();
  if(polData.get(wordTmp) == null) { polResultNum = 0; }
while(tmp != null) {
  else { polResultNum = polData.get(wordTmp); }
  String[] str = tmp.split(" ");
ecoResultNum += 1;
  for(String tmpStr:str) {
  polResultNum += 1;
  if(economyWord.get(tmpStr) == null) { ecoResultNum = 0; }
  if(isEconomy && polData.get(wordTmp) == null) { polResultNum -= 0.5; } // 경제파트이면서 정치쪽에 없는 단어에 Advantage 부과
  else { ecoResultNum = economyWord.get(tmpStr); }
if(!isEconomy && ecoData.get(wordTmp) == null) { ecoResultNum -= 0.5; } // 정치파트이면서 경제쪽에 없는 단어에 Adventage 부과
  if(politicWord.get(tmpStr) == null) { polResultNum = 0; }
  if(isEconomy) { reslt += Math.log(ecoResultNum / polResultNum); }
  else { polResultNum = politicWord.get(tmpStr); }
else { reslt += Math.log(polResultNum / ecoResultNum); }
  polResultNum+=1;
  ecoResultNum+=1;
  reslt += Math.log(ecoResultNum / polResultNum);
  }
  }
  if(reslt < 0) { negaNum+= 1; }
  if(reslt < 0) { negaNum+= 1; }
  else { posiNum += 1; }
  else { posiNum += 1; }
  reslt = 0;
  reslt = 0;
tmp = targetDocument.readLine();
  }
  }
double accu = (posiNum / (posiNum+negaNum));
  targetDocument.close();
  targetDocument.close();
posiNum = 0;
negaNum = 0;
polResultNum = 0;
ecoResultNum = 0;
reslt = Math.log((double)economyNum/politicNum);
targetDocument = new BufferedReader(new FileReader(new File("C:/Users/KDJ/Documents/Devils/svm_data.tar/package/test/politics/politics.txt")));
tmp = targetDocument.readLine();
while(tmp != null) {
String[] str = tmp.split(" ");
for(String tmpStr:str) {
if(economyWord.get(tmpStr) == null) { ecoResultNum = 0; }
else { ecoResultNum = economyWord.get(tmpStr); }
if(politicWord.get(tmpStr) == null) { polResultNum = 0; }
else { polResultNum = politicWord.get(tmpStr); }
polResultNum+=1;
ecoResultNum+=1;
reslt += Math.log(polResultNum / ecoResultNum);
}
if(reslt < 0) { negaNum+= 1; }
else { posiNum += 1; }
reslt = 0;
tmp = targetDocument.readLine();
}
double accu2 = ((double)posiNum / (posiNum+negaNum));
System.out.println((accu + accu2) / 2);
  } catch (FileNotFoundException e) {
  } catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
  // TODO Auto-generated catch block
  // TODO Auto-generated catch block
  e.printStackTrace();
  e.printStackTrace();
  }
  }
return posiNum / (posiNum+negaNum);
}
public static void Init() {
machineTrain = new Train();
machineTrain.TrainData();
ecoData = machineTrain.getEconomyData();
polData = machineTrain.getPoliticData();
}
public static void main(String[] args) {
Init();
double result1 = DocumentResult(new File("svm_data.tar/package/test/economy/economy.txt"), true);
System.out.println(result1);
double result2 = DocumentResult(new File("svm_data.tar/package/test/politics/politics.txt"), false);
System.out.println(result2);
System.out.println((result1 + result2) / 2);
  }
  }
  }
  }
나름 88%의 적중률 ㅋㅋㅋㅋㅋ
Train 의 Economy.txt 파일 적중도 : 0.995 (99.5%)
P.S : 조만간 발적화 코드를 바꿀예정
Train Politics.txt 파일 적중도 : 0.96 (96%)
전체 평균 적중도 : 0.9775 (97.75%)
위의 주석처럼 약간의 Advantage 와 필요없는 (http, //, blog, yahoo, empas, tistory 같은) 단어를 제외하고 작성할 수 있게 수정했습니다.
결과를 볼 수 있었으면 좋겠네요 ^^;;



Revision as of 05:24, 29 June 2011

Describe 데블스캠프2011/둘째날/Machine-Learning/NaiveBayesClassifier/김동준 here

Train.java

package org.zeropage.machinelearn;

import java.io.*;
import java.util.*;

/**
 * Builds word-frequency tables for the "economy" and "politics" training
 * corpora.
 *
 * <p>Call {@link #TrainData()} first; until then the getters return
 * {@code null} maps and zero counts.
 */
class Train {
	private static final String ECONOMY_TRAIN_PATH = "svm_data.tar/package/train/economy/index.economy.db";
	private static final String POLITICS_TRAIN_PATH = "svm_data.tar/package/train/politics/index.politics.db";

	// Site, blog and URL fragments carry little topical signal, so they are
	// excluded from the frequency tables.
	private static final Set<String> SKIP_WORDS = new HashSet<String>(Arrays.asList(
		"http", "blog", "com", "naver", "empas", "daum", "yahoo", "tistory",
		"co", "kr", "www", "ohmynews", "//", "블로그"));

	// Declared as HashMap (not Map) because the public getters return HashMap;
	// this avoids a downcast on every call.
	private HashMap<String,Integer> economyWord;
	private HashMap<String,Integer> politicWord;
	private int economyNum;
	private int politicNum;

	/**
	 * Tells whether a token should be left out of the frequency tables.
	 *
	 * @param inputStr token produced by splitting a line on single spaces
	 * @return {@code true} for one-character tokens and for the fixed list of
	 *         site/URL words; {@code false} otherwise
	 */
	private boolean isSkipData(String inputStr) {
		return inputStr.length() == 1 || SKIP_WORDS.contains(inputStr);
	}

	/**
	 * Adds the occurrence count of every non-skipped token in {@code source}
	 * to {@code counts}. Lines are split on a single space character.
	 *
	 * @param source file to read
	 * @param counts table that receives the per-word occurrence counts
	 * @throws FileNotFoundException if {@code source} cannot be opened
	 */
	private void countWords(File source, Map<String,Integer> counts) throws FileNotFoundException {
		Scanner reader = new Scanner(source);
		try {
			while (reader.hasNextLine()) {
				for (String word : reader.nextLine().split(" ")) {
					if (isSkipData(word)) { continue; }
					Integer seen = counts.get(word);
					counts.put(word, seen == null ? 1 : seen + 1);
				}
			}
		} finally {
			// Release the file handle even when reading fails part-way.
			reader.close();
		}
	}

	/**
	 * Resets all state and rebuilds both frequency tables from the training
	 * files, economy first and then politics.
	 *
	 * <p>If a file cannot be opened the stack trace is printed and loading
	 * stops there: tables filled so far are kept, the rest stay empty.
	 */
	public void TrainData() {
		this.economyNum = 0;
		this.politicNum = 0;
		this.economyWord = new HashMap<String,Integer>();
		this.politicWord = new HashMap<String,Integer>();
		try {
			countWords(new File(ECONOMY_TRAIN_PATH), this.economyWord);
			// The distinct-word counter always equals the number of keys.
			this.economyNum = this.economyWord.size();
			countWords(new File(POLITICS_TRAIN_PATH), this.politicWord);
			this.politicNum = this.politicWord.size();
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		}
	}

	/** @return word-to-occurrence table of the economy corpus ({@code null} before training) */
	public HashMap<String,Integer> getEconomyData() {
		return this.economyWord;
	}

	/** @return word-to-occurrence table of the politics corpus ({@code null} before training) */
	public HashMap<String,Integer> getPoliticData() {
		return this.politicWord;
	}

	/** @return number of distinct words counted in the economy corpus */
	public int getEconomyNumber() {
		return this.economyNum;
	}

	/** @return number of distinct words counted in the politics corpus */
	public int getPoliticNumber() {
		return this.politicNum;
	}
}

Analyzer.java

package org.zeropage.machinelearn;

import java.util.*;
import java.io.*;

/**
 * Scores test documents against the word-frequency tables produced by
 * {@link Train} and prints the fraction of lines judged to match the
 * expected category.
 */
public class Analyzer {
	private static HashMap<String,Integer> ecoData;
	private static HashMap<String,Integer> polData;
	private static Train machineTrain;

	/**
	 * Computes the log-ratio score of one line for the expected category.
	 *
	 * <p>Each token contributes {@code log(expected / other)}, where both
	 * counts are the training occurrence counts plus one. When the token is
	 * missing from the other category's table, that count is reduced by 0.5,
	 * giving the expected category an advantage.
	 *
	 * <p>NOTE(review): the advantage depends on {@code isEconomy}, i.e. on the
	 * label being tested, so the reported hit rate is probably optimistic.
	 *
	 * @param line      one line of the test file; split on single spaces
	 * @param isEconomy {@code true} when economy is the expected category
	 * @return sum of the per-token log ratios
	 */
	private static double scoreLine(String line, boolean isEconomy) {
		double reslt = 0;
		for (String wordTmp : line.split(" ")) {
			// Look each table up once and reuse the result below.
			Integer ecoCount = ecoData.get(wordTmp);
			Integer polCount = polData.get(wordTmp);
			double ecoResultNum = (ecoCount == null ? 0 : ecoCount) + 1.0;
			double polResultNum = (polCount == null ? 0 : polCount) + 1.0;
			// Economy expected and word unknown to politics: favour economy.
			if (isEconomy && polCount == null) { polResultNum -= 0.5; }
			// Politics expected and word unknown to economy: favour politics.
			if (!isEconomy && ecoCount == null) { ecoResultNum -= 0.5; }
			if (isEconomy) { reslt += Math.log(ecoResultNum / polResultNum); }
			else { reslt += Math.log(polResultNum / ecoResultNum); }
		}
		return reslt;
	}

	/**
	 * Scores every line of {@code f} and returns the share of lines whose
	 * score is non-negative, i.e. that favour the expected category.
	 *
	 * @param f         test file, one document per line
	 * @param isEconomy {@code true} when the file holds economy documents
	 * @return hit rate in [0, 1]; {@code NaN} when no line was read (missing
	 *         or empty file), because the division is 0 / 0
	 */
	private static double DocumentResult(File f, boolean isEconomy) {
		double negaNum = 0;
		double posiNum = 0;
		try {
			Scanner targetDocument = new Scanner(f);
			try {
				while (targetDocument.hasNextLine()) {
					double reslt = scoreLine(targetDocument.nextLine(), isEconomy);
					if (reslt < 0) { negaNum += 1; }
					else { posiNum += 1; }
				}
			} finally {
				// Release the file handle even when scoring fails part-way.
				targetDocument.close();
			}
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		}
		return posiNum / (posiNum+negaNum);
	}

	/** Trains a fresh {@link Train} instance and caches its two frequency tables. */
	public static void Init() {
		machineTrain = new Train();
		machineTrain.TrainData();
		ecoData = machineTrain.getEconomyData();
		polData = machineTrain.getPoliticData();
	}

	/**
	 * Trains the model, then prints the economy hit rate, the politics hit
	 * rate and their average.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		Init();
		double result1 = DocumentResult(new File("svm_data.tar/package/test/economy/economy.txt"), true);
		System.out.println(result1);
		double result2 = DocumentResult(new File("svm_data.tar/package/test/politics/politics.txt"), false);
		System.out.println(result2);
		System.out.println((result1 + result2) / 2);
	}
}

Train 의 Economy.txt 파일 적중도 : 0.995 (99.5%)
Train 의 Politics.txt 파일 적중도 : 0.96 (96%)
전체 평균 적중도 : 0.9775 (97.75%)

위의 주석처럼 약간의 Advantage 를 부여하고, 필요 없는 단어(http, //, blog, yahoo, empas, tistory 등)를 제외하도록 수정했습니다. 이 결과를 볼 수 있었으면 좋겠네요 ^^;;