Return to Snippet

Revision: 46269
at May 18, 2011 01:43 by mjaniec


Updated Code
library(lsa)

# create some files
td = tempfile()
dir.create(td)
write( c("dog", "cat", "mouse"), file=paste(td, "D1", sep="/"))
write( c("hamster", "mouse", "sushi"), file=paste(td, "D2", sep="/"))
write( c("dog", "monster", "monster"), file=paste(td, "D3", sep="/"))
write( c("dog", "mouse", "dog"), file=paste(td, "D4", sep="/"))

# read files into a document-term matrix
myMatrix = textmatrix(td, minWordLength=1) # textvector dla jednego pliku

myMatrix = lw_bintf(myMatrix) * gw_idf(myMatrix)

summary(myMatrix)

# create the latent semantic space
myLSAspace = lsa(myMatrix, dims=dimcalc_raw())

# display it as a textmatrix again
round(as.textmatrix(myLSAspace),2) # should give the original

# create the latent semantic space
myLSAspace = lsa(myMatrix, dims=dimcalc_share())

# display it as a textmatrix again
myNewMatrix = as.textmatrix(myLSAspace)
myNewMatrix # should look be different!

# compare two terms with the cosine measure
cosine(myNewMatrix["dog",], myNewMatrix["cat",])

# calc associations for mouse
associate(myNewMatrix, "mouse")

# demonstrate generation of a query
query("monster", rownames(myNewMatrix))
query("monster dog", rownames(myNewMatrix)) 

# compare two documents with pearson
cor(myNewMatrix[,1], myNewMatrix[,2], method="pearson")

# clean up
unlink(td, recursive=TRUE)

# [---]

# LSA search
q <- fold_in(query("sushi hamster", rownames(myNewMatrix)),myLSAspace) # query <> LSAspace

qd <- 0
for (i in 1:ncol(myNewMatrix)) {

	qd[i] <- cosine(as.vector(q),as.vector(myNewMatrix[,i]))

}

#---

# sk - stala na potrzeby Query

sk <- matrix(0,length(myLSAspace$sk),length(myLSAspace$sk))
	for (i in 1:length(myLSAspace$sk)) {
		sk[i,i] <- myLSAspace$sk[i]
	}

QueryVector <- function(p) {

	q <- query(p, rownames(myNewMatrix))	

	t(q) %*% myLSAspace$tk %*% solve(sk)
	
}

SearchPhraseOld <- function(p) { # zwraca miary dla wszystkich dokumentow

	q <- fold_in(query(p, rownames(myNewMatrix)),myLSAspace)

	qd <- 0
	for (i in 1:ncol(myNewMatrix)) {

		qd[i] <- cosine(as.vector(q),as.vector(myNewMatrix[,i]))

	}

	qd

}

SearchPhrase <- function(p) {

	q <- QueryVector(p)

	qd <- 0
	for (i in 1:nrow(myLSAspace$dk)) {

		qd[i] <- cosine(as.vector(q),as.vector(myLSAspace$dk[i,]))

	}

	qd


}

# porownanie correlation i cosine distance dla search:

cor(q,myNewMatrix,method="spearman")

qd

# compare two phrases

ComparePhrasesOld <- function(p1, p2) {

	q1 <- fold_in(query(p1, rownames(myNewMatrix)),myLSAspace)
	q2 <- fold_in(query(p2, rownames(myNewMatrix)),myLSAspace)

	cosine(as.vector(q1),as.vector(q2)) # alternatywnie mozna zastosowac cor

}


ComparePhrases <- function(p1, p2) {

	q1 <- QueryVector(p1)
	q2 <- QueryVector(p2)

	cosine(as.vector(q1),as.vector(q2)) # alternatywnie mozna zastosowac cor

}

ComparePhrases("cat dog","monster dog")

ComparePhrases("sushi hamster","monster dog")

#---

q <- query("sushi hamster", rownames(myNewMatrix))

sk <- matrix(0,length(myLSAspace$sk),length(myLSAspace$sk))
for (i in 1:length(myLSAspace$sk)) {
	sk[i,i] <- myLSAspace$sk[i]
}

t(q) %*% myLSAspace$tk %*% solve(sk)

Revision: 46268
at May 15, 2011 23:13 by mjaniec


Initial Code
library(lsa)

# create some files
td = tempfile()
dir.create(td)
write( c("dog", "cat", "mouse"), file=paste(td, "D1", sep="/"))
write( c("hamster", "mouse", "sushi"), file=paste(td, "D2", sep="/"))
write( c("dog", "monster", "monster"), file=paste(td, "D3", sep="/"))
write( c("dog", "mouse", "dog"), file=paste(td, "D4", sep="/"))

# read files into a document-term matrix
myMatrix = textmatrix(td, minWordLength=1) # textvector dla jednego pliku

myMatrix = lw_bintf(myMatrix) * gw_idf(myMatrix)

summary(myMatrix)

# create the latent semantic space
myLSAspace = lsa(myMatrix, dims=dimcalc_raw())

# display it as a textmatrix again
round(as.textmatrix(myLSAspace),2) # should give the original

# create the latent semantic space
myLSAspace = lsa(myMatrix, dims=dimcalc_share())

# display it as a textmatrix again
myNewMatrix = as.textmatrix(myLSAspace)
myNewMatrix # should look be different!

# compare two terms with the cosine measure
cosine(myNewMatrix["dog",], myNewMatrix["cat",])

# calc associations for mouse
associate(myNewMatrix, "mouse")

# demonstrate generation of a query
query("monster", rownames(myNewMatrix))
query("monster dog", rownames(myNewMatrix)) 

# compare two documents with pearson
cor(myNewMatrix[,1], myNewMatrix[,2], method="pearson")

# clean up
unlink(td, recursive=TRUE)

# [---]

# LSA search
q <- fold_in(query("sushi hamster", rownames(myNewMatrix)),myLSAspace) # query <> LSAspace

qd <- 0
for (i in 1:ncol(myNewMatrix)) {

	qd[i] <- cosine(as.vector(q),as.vector(myNewMatrix[,i]))

}

# porownanie correlation i cosine distance dla search:

cor(q,myNewMatrix,method="spearman")

qd

# compare two phrases

ComparePhrases <- function(p1, p2) {

	q1 <- fold_in(query(p1, rownames(myNewMatrix)),myLSAspace)
	q2 <- fold_in(query(p2, rownames(myNewMatrix)),myLSAspace)

	cosine(as.vector(q1),as.vector(q2)) # alternatywnie mozna zastosowac cor

}

ComparePhrases("cat dog","monster dog")

ComparePhrases("sushi hamster","monster dog")

Initial URL
http://mjaniec.blogspot.com/2011/05/testing-lsa-in-r.html

Initial Description


Initial Title
Testing LSA in R

Initial Tags


Initial Language
R