% Conference paper in the proceedings of the 2018 IEEE International Conference on Big Data.
% NOTE(review): month/day below (2 July) precede the conference dates recorded in the
% note field (10-13 December 2018) -- confirm the intended publication date.
% NOTE(review): the address field holds a country rather than a publisher city -- confirm.
@inproceedings{c2cef66d1bd54f42b61a6daaf814a6d2,
  author    = {Wu, Jian and Kandimalla, Bharath and Rohatgi, Shaurya and Sefid, Athar and Mao, Jianyu and Giles, {C. Lee}},
  title     = {{CiteSeerX-2018}: A Cleansed Multidisciplinary Scholarly Big Dataset},
  editor    = {Abe, Naoki and Liu, Huan and Pu, Calton and Hu, Xiaohua and Ahmed, Nesreen and Qiao, Mu and Song, Yang and Kossmann, Donald and Liu, Bing and Lee, Kisung and Tang, Jiliang and He, Jingrui and Saltz, Jeffrey},
  booktitle = {Proceedings - 2018 IEEE International Conference on Big Data, Big Data 2018},
  pages     = {5465--5467},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2018},
  month     = jul,
  day       = {2},
  doi       = {10.1109/BigData.2018.8622114},
  language  = {English (US)},
  note      = {Funding Information: This project is partially supported by NSF. Publisher Copyright: {\textcopyright} 2018 IEEE.; 2018 IEEE International Conference on Big Data, Big Data 2018 ; Conference date: 10-12-2018 Through 13-12-2018},
  abstract  = {We report the preliminary work on cleansing and classifying a scholarly big dataset containing 10+ million academic documents released by CiteSeerX. We design novel approaches to match paper entities in CiteSeerX to reference datasets, including DBLP, Web of Science, and Medline, resulting in 4.2M unique matches, whose metadata can be cleansed. We also investigate traditional machine learning and neural network methods to classify abstracts into 6 subject categories. The classification results reveal that the current CiteSeerX dataset is highly multidisciplinary, containing papers well beyond computer and information sciences.},
}