% Conference paper in the WAIM 2014 proceedings (Lecture Notes in Computer Science series).
% Author names use the "Last, First" form; the DOI is stored bare (no LaTeX escapes).
% NOTE(review): `address` holds a country rather than the publisher's city, and no
% LNCS `volume` number is recorded -- confirm both against the publisher's record.
@inproceedings{b0e25e6be9c14cfb831b5181c0cb6cb5,
  author    = {Nie, Tiezheng and Lee, Wang Chien and Shen, Derong and Yu, Ge and Kou, Yue},
  title     = {Distributed entity resolution based on similarity join for large-scale data clustering},
  booktitle = {Web-Age Information Management - 15th International Conference, {WAIM} 2014, Proceedings},
  series    = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
  publisher = {Springer Verlag},
  address   = {Germany},
  year      = {2014},
  pages     = {138--149},
  isbn      = {9783319080093},
  doi       = {10.1007/978-3-319-08010-9_16},
  language  = {English (US)},
  note      = {15th International Conference on Web-Age Information Management, WAIM 2014 ; Conference date: 16-06-2014 Through 18-06-2014},
  abstract  = {Entity resolution has been widely used in data mining applications to find similar records. However, the increasing scale and complexity of data has restricted the performance of entity resolution. In this paper, we propose a novel entity resolution framework that clusters large-scale data with distributed entity resolution method. We model the clustering problem as finding similarity sub connected graphs from records. Firstly, our approach finds pairs of records whose similarities are above a given threshold based on appjoin algorithm which extends the ppjoin algorithm and are executed on MapReduce framework. Then, we propose a cache-based algorithm which cluster entities with similar pairs based on the Disjoin Set algorithm and are also designed for MapReduce framework. Experimental results on real dataset show that our algorithms can achieve more efficiency than previous algorithms on the entity resolution and clustering.},
}