a
     ɝh1                     @   s   d dl mZmZ d dlZd dlZd dlZd dlZd dlZd dlZd dl	Z
d dlZd dlmZ d dlmZmZ d dlmZ d dlmZ eddddd	d
d	fddZdd ZG dd deZG dd deZG dd deZG dd deZdS )    )print_functiondivisionN)	Structure)Dataset
DataLoader)default_collate)SubsetRandomSampler@   g?F   c	                 K   sb  t | }
|	d du r^|du rJ|| dk s,J d| | }td| d n|| | dks^J tt|
}|	d r||	d }nt||
 }|	d r|	d }nt||
 }|	d r|	d }nt||
 }t|d| }t|||  |  }|rt|| d }t| |||||d}t| |||||d}|rFt| |||||d}|rV|||fS ||fS dS )	a  
    Utility function for dividing a dataset to train, val, test datasets.

    !!! The dataset needs to be shuffled before using the function !!!

    Parameters
    ----------
    dataset: torch.utils.data.Dataset
      The full dataset to be divided.
    collate_fn: torch.utils.data.DataLoader
    batch_size: int
    train_ratio: float
    val_ratio: float
    test_ratio: float
    return_test: bool
      Whether to return the test dataset loader. If False, the last test_size
      data will be hidden.
    num_workers: int
    pin_memory: bool

    Returns
    -------
    train_loader: torch.utils.data.DataLoader
      DataLoader that random samples the training data.
    val_loader: torch.utils.data.DataLoader
      DataLoader that random samples the validation data.
    (test_loader): torch.utils.data.DataLoader
      DataLoader that random samples the test data, returns if
        return_test=True.
    
train_sizeNr
   zB[Warning] train_ratio is None, using 1 - val_ratio - test_ratio = z as training data.	test_sizeZval_size)
batch_sizesamplernum_workers
collate_fn
pin_memory)lenprintlistrangeintr   r   )datasetr   r   Ztrain_ratioZ	val_ratioZ
test_ratioZreturn_testr   r   kwargs
total_sizeindicesr   r   Z
valid_sizeZtrain_samplerZval_samplerZtest_samplerZtrain_loader
val_loadertest_loader r   $/var/www/html/AI4Kappa/cgcnn/data.pyget_train_val_test_loader   sX    "



r   c                 C   s   g g g   }}}g g  }}g }d}t | D ]z\}\\}	}
}}}|	jd }||	 ||
 |||  tt|| }|| || || ||7 }q*tj|ddtj|ddtj|dd|ftj|dd|fS )a  
    Collate a list of data and return a batch for predicting crystal
    properties.

    Parameters
    ----------

    dataset_list: list of tuples for each data point.
      (atom_fea, nbr_fea, nbr_fea_idx, target)

      atom_fea: torch.Tensor shape (n_i, atom_fea_len)
      nbr_fea: torch.Tensor shape (n_i, M, nbr_fea_len)
      nbr_fea_idx: torch.LongTensor shape (n_i, M)
      target: torch.Tensor shape (1, )
      cif_id: str or int

    Returns
    -------
    N = sum(n_i); N0 = sum(i)

    batch_atom_fea: torch.Tensor shape (N, orig_atom_fea_len)
      Atom features from atom type
    batch_nbr_fea: torch.Tensor shape (N, M, nbr_fea_len)
      Bond features of each atom's M neighbors
    batch_nbr_fea_idx: torch.LongTensor shape (N, M)
      Indices of M neighbors of each atom
    crystal_atom_idx: list of torch.LongTensor of length N0
      Mapping from the crystal idx to atom idx
    target: torch.Tensor shape (N, 1)
      Target value for prediction
    batch_cif_ids: list
    r   )dim)		enumerateshapeappendtorch
LongTensornparangecatstack)Zdataset_listZbatch_atom_feaZbatch_nbr_feaZbatch_nbr_fea_idxZcrystal_atom_idxZbatch_targetbatch_cif_idsZbase_idxiatom_feanbr_feanbr_fea_idxtargetcif_idn_inew_idxr   r   r   collate_poolb   s.    !







r3   c                   @   s"   e Zd ZdZdddZdd ZdS )GaussianDistancezE
    Expands the distance by Gaussian basis.

    Unit: angstrom
    Nc                 C   sF   ||k sJ || |ksJ t ||| || _|du r<|}|| _dS )z
        Parameters
        ----------

        dmin: float
          Minimum interatomic distance
        dmax: float
          Maximum interatomic distance
        step: float
          Step size for the Gaussian filter
        N)r&   r'   filtervar)selfdmindmaxstepr6   r   r   r   __init__   s    zGaussianDistance.__init__c                 C   s*   t |dt jf | j d  | jd  S )aw  
        Apply Gaussian disntance filter to a numpy distance array

        Parameters
        ----------

        distance: np.array shape n-d array
          A distance matrix of any shape

        Returns
        -------
        expanded_distance: shape (n+1)-d array
          Expanded distance matrix with the last dimension of length
          len(self.filter)
        .   )r&   expnewaxisr5   r6   )r7   	distancesr   r   r   expand   s    zGaussianDistance.expand)N)__name__
__module____qualname____doc__r;   r@   r   r   r   r   r4      s   
r4   c                   @   s8   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d ZdS )AtomInitializerzz
    Base class for intializing the vector representation for atoms.

    !!! Use one AtomInitializer per dataset !!!
    c                 C   s   t || _i | _d S N)set
atom_types
_embedding)r7   rH   r   r   r   r;      s    
zAtomInitializer.__init__c                 C   s   || j v sJ | j| S rF   )rH   rI   )r7   	atom_typer   r   r   get_atom_fea   s    zAtomInitializer.get_atom_feac                 C   s0   || _ t| j  | _dd | j  D | _d S )Nc                 S   s   i | ]\}}||qS r   r   .0rJ   idxr   r   r   
<dictcomp>       z3AtomInitializer.load_state_dict.<locals>.<dictcomp>)rI   rG   keysrH   items_decodedict)r7   
state_dictr   r   r   load_state_dict   s
    zAtomInitializer.load_state_dictc                 C   s   | j S rF   )rI   r7   r   r   r   rT      s    zAtomInitializer.state_dictc                 C   s*   t | ds dd | j D | _| j| S )NrS   c                 S   s   i | ]\}}||qS r   r   rL   r   r   r   rO      rP   z*AtomInitializer.decode.<locals>.<dictcomp>)hasattrrI   rR   rS   )r7   rN   r   r   r   decode   s
    
zAtomInitializer.decodeN)	rA   rB   rC   rD   r;   rK   rU   rT   rX   r   r   r   r   rE      s   rE   c                       s    e Zd ZdZ fddZ  ZS )AtomCustomJSONInitializera  
    Initialize atom feature vectors using a JSON file, which is a python
    dictionary mapping from element number to a list representing the
    feature vector of the element.

    Parameters
    ----------

    elem_embedding_file: str
        The path to the .json file
    c                    s   t |}t|}W d    n1 s(0    Y  dd | D }t| }tt| | | D ]\}}t	j
|td| j|< qhd S )Nc                 S   s   i | ]\}}t ||qS r   )r   )rM   keyvaluer   r   r   rO      rP   z6AtomCustomJSONInitializer.__init__.<locals>.<dictcomp>)dtype)openjsonloadrR   rG   rQ   superrY   r;   r&   arrayfloatrI   )r7   Zelem_embedding_filefZelem_embeddingrH   rZ   r[   	__class__r   r   r;      s    
(z"AtomCustomJSONInitializer.__init__)rA   rB   rC   rD   r;   __classcell__r   r   rd   r   rY      s   rY   c                   @   s*   e Zd ZdZdddZd	d
 Zdd ZdS )CIFDatau  
    The CIFData dataset is a wrapper for a dataset where the crystal structures
    are stored in the form of CIF files. The dataset should have the following
    directory structure:

    root_dir
    ├── id_prop.csv
    ├── atom_init.json
    ├── id0.cif
    ├── id1.cif
    ├── ...
    
    id_prop.csv: CSV file that contains two columns. The first column recodes a
    unique ID for each crystal, and the second column recodes the value of
    target property.
    
    atom_init.json: JSON file that contains the initialization vector for each
    element.
    
    ID.cif: CIF file that recodes the crystal structure, where ID is the 
    unique ID for the crystal.

    Parameters
    ----------

    root_dir: str
        The path to the root directory of the dataset
    max_num_nbr: int
        The maximum number of neighbors while constructing the crystal graph
    radius: float
        The cutoff radius for searching neighbors
    dmin: float
        The minimum distance for constructing GaussianDistance
    step: float
        The step size for constructing GaussianDistance
    random_seed: int
        Random seed for shuffling the dataset

    Returns
    -------

    atom_fea: torch.Tensor shape (n_i, atom_fea_len)
    nbr_fea: torch.Tensor shape (n_i, M, nbr_fea_len)
    nbr_fea_idx: torch.LongTensor shape (n_i, M)
    target: torch.Tensor shape (1, )
    cif_id: str or int
          r   皙?{   c                 C   s   || _ || | _| _tj|s(J dtj| j d}tj|sLJ dt|2}t	|}	t
|	 dd |	D | _W d    n1 s0    Y  t| t| j tj| j d}
tj|
sJ dt|
| _t|| j|d| _d S )	Nzroot_dir does not exist!zid_prop.csvzid_prop.csv does not exist!c                 S   s   g | ]}|qS r   r   )rM   rowr   r   r   
<listcomp>6  rP   z$CIFData.__init__.<locals>.<listcomp>zatom_init.jsonzatom_init.json does not exist!)r8   r9   r:   )root_dirmax_num_nbrradiusospathexistsjoinr]   csvreadernextid_prop_datarandomseedshufflerY   arir4   gdf)r7   rn   ro   rp   r8   r:   random_seedZid_prop_filerc   rv   Zatom_init_filer   r   r   r;   ,  s    

.

zCIFData.__init__c                 C   s
   t | jS rF   )r   rx   rV   r   r   r   __len__>  s    zCIFData.__len__c           	   	      s  j | \}}ttjj|  t fddt	t
 D }t|} jjdd}dd |D }g g  }}|D ]}t
|jk rtd| |ttdd |d	gjt
|    |ttd
d |jd gjt
|    q~|ttdd |d j  |ttdd |d j  q~t|t| }}j|}t|}t|}t|}tt|g}|||f||fS )Nc                    s    g | ]}j  | jjqS r   )r|   rK   specienumber)rM   r+   Zcrystalr7   r   r   rm   E  s   z'CIFData.__getitem__.<locals>.<listcomp>T)include_indexc                 S   s   g | ]}t |d d dqS )c                 S   s   | d S Nr
   r   xr   r   r   <lambda>I  rP   z0CIFData.__getitem__.<locals>.<listcomp>.<lambda>)rZ   )sorted)rM   Znbrsr   r   r   rm   I  rP   z`{} not find enough neighbors to build graph. If it happens frequently, consider increase radius.c                 S   s   | d S Nr<   r   r   r   r   r   r   P  rP   z%CIFData.__getitem__.<locals>.<lambda>r   c                 S   s   | d S r   r   r   r   r   r   r   R  rP   g      ?c                 S   s   | d S r   r   r   r   r   r   r   V  rP   c                 S   s   | d S r   r   r   r   r   r   r   X  rP   )rx   r   	from_filerq   rr   rt   rn   r&   vstackr   r   r$   Tensorget_all_neighborsrp   ro   warningswarnformatr#   r   mapra   r}   r@   r%   rb   )	r7   rN   r0   r/   r,   Zall_nbrsr.   r-   nbrr   r   r   __getitem__A  sJ    






zCIFData.__getitem__N)rh   ri   r   rj   rk   )rA   rB   rC   rD   r;   r   r   r   r   r   r   rg      s   /  
rg   )
__future__r   r   ru   	functoolsr^   rq   ry   r   numpyr&   r$   pymatgen.core.structurer   torch.utils.datar   r   torch.utils.data.dataloaderr   torch.utils.data.samplerr   r   r3   objectr4   rE   rY   rg   r   r   r   r   <module>   s,   
P8-