pandas 使用rdkit生成元组

xam8gpfp  于 2023-03-11  发布在  其他
关注(0)|答案(1)|浏览(94)

编辑2:所以对于这个微笑('CC(C)(C)C1=C(C(=C(C=C1)O)O)O'),距离度量应该是这样的。

[1, 2.0, 3.0, 3.0, 3.0, 4.5, 6.0, 7.5, 6.0, 4.5, 8.5, 7.0, 5.5].

这个向量是按照微笑中字母的顺序排列的。

| SMILES                          |  C-C           |  C-C-C-C-C     |  C-C-C@C@      |  C-C@C@O-      | C-C@C@         |O-C           |
| ------------------------------- | -------------- | -------------- | -------------- | -------------- | -------------- |--------------|
| CC(C)(C)C1=C(C(=C(C=C1)O)O)O    | 3              | 1              | 1              | 3              | 2              |     3        |

现在我想要的是这个。

| SMILES                          |  C-C           |  C-C-C-C-C     |  C-C-C@C@      |  C-C@C@O-      | C-C@C@         |O-C           |
| ------------------------------- | -------------- | -------------- | -------------- | -------------- | -------------- |--------------|    
|   CC(C)(C)C1=C(C(=C(C=C1)O)O)O  |    [1, 3, 3]   |    [2]         | [3]            |  [4.5, 6, 7.5] |    [6, 4]      | [8.5, 7, 5.5] |

从上表中可以看到,向量中的列数代表组的频率,其值对应于组中每个出现的距离度量,我认为需要一种索引来将定义的每个组与距离度量相关联。
编辑1:我已经在下面发布了我的尝试。
我正在使用rdkit对群进行分类,并从一个字符串(代表一个分子)中得到一个距离度量,我试图创建一个向量,该向量将以官能团为首,其行数等于该官能团在一个微笑(代表分子的字符串)中出现的次数,其中的值是拓扑距离度量。
| 喀喀喀|克-克-|
| - ------|- ------|
| 1.5 |1个|
| 三个|五个|
| 四个||
我有2个代码,1个得到组,1个得到距离。我一直在努力将它们组合起来,以获得上面的输出。我已经尝试过了,但一直在努力。如果有人想看,我可以发送我尝试过的东西。这是我为每个作业制作的两个代码。
一个三个三个一个
有人能指导我如何做这件事,或者更好地告诉我的代码?非常感谢

#attempt 1

import pandas as pd
from rdkit import Chem
from queue import PriorityQueue

def collect_bonded_atoms(smile):
    mol = Chem.MolFromSmiles(smile)
    atom_counts = {}
    distances = {}
    for atom in mol.GetAtoms():
        charge = '+' if atom.GetFormalCharge() > 0 else '-' if atom.GetFormalCharge() < 0 else ''
        neighbors = [(neighbor.GetSymbol(), bond.GetBondType())
                     for neighbor, bond in zip(atom.GetNeighbors(), atom.GetBonds())]
        neighbors.sort()
        key = "{}{}-{}".format(charge, atom.GetSymbol(), "".join(f"{symbol}{'-' if bond_order == 1 else '=' if bond_order == 2 else '#' if bond_order == 3 else '@'}" for symbol, bond_order in neighbors))
        atom_counts[key] = atom_counts.get(key, 0) + 1
        
        distances[key] = {}
        dijkstra_distances = dijkstra(mol, atom.GetIdx())
        for neighbor in atom.GetNeighbors():
            neighbor_key = "{}{}-{}".format('',
                                            neighbor.GetSymbol(),
                                            "".join(f"{bond.GetBondTypeAsDouble()}" for bond in mol.GetBonds() if (bond.GetBeginAtomIdx() == atom.GetIdx() and bond.GetEndAtomIdx() == neighbor.GetIdx()) or (bond.GetBeginAtomIdx() == neighbor.GetIdx() and bond.GetEndAtomIdx() == atom.GetIdx())))
            neighbor_index = neighbor.GetIdx()
            distances[key][neighbor_key] = dijkstra_distances[neighbor_index]
        
    return atom_counts, distances

def dijkstra(mol, start_atom_index):
    """Computes the shortest path between a start atom and all other atoms in the molecule using Dijkstra's algorithm.
    
    Args:
    mol (RDKit Mol object): The input molecule.
    start_atom_index (int): The index of the start atom.
    
    Returns:
    A list of distances from the start atom to each other atom in the molecule, where the i-th element is the distance from the start atom to the i-th atom in the molecule.
    """
    # Initialize distances to infinity
    distances = [float('inf')] * mol.GetNumAtoms()
    distances[start_atom_index] = 0
    
    # Initialize priority queue with start atom
    pq = PriorityQueue()
    pq.put((0, start_atom_index))
    
    # Loop until all atoms have been visited
    while not pq.empty():
        # Get the next atom with the shortest distance
        dist, atom_index = pq.get()
        
        # Update the distances of its neighbors
        atom = mol.GetAtomWithIdx(atom_index)
        for neighbor in atom.GetNeighbors():
            neighbor_index = neighbor.GetIdx()
            bond = mol.GetBondBetweenAtoms(atom_index, neighbor_index)
            bond_length = bond.GetBondTypeAsDouble()
            new_distance = dist + bond_length
            
            # Update the distance if it is shorter
            if new_distance < distances[neighbor_index]:
                distances[neighbor_index] = new_distance
                pq.put((new_distance, neighbor_index))
    
    return distances

smile_list = ['C1=CC=CC=C1', 'CC(C)C', 'CCO', 'C1=CC=C2C(=C1)C(=CN2)CC=C']
data = []
for smile in smile_list:
    atom_counts, distances = collect_bonded_atoms(smile)
    for key, value in atom_counts.items():
        row = {}
        
        # Add the SMILES to the row
        row['SMILES'] = smile

        # Add the presence of each group to the row
        for group, count in atom_counts.items():
            row[group] = 1 if group == key else 0

        # Add the distance of each group to the row
        for group, dist_dict in distances.items():
            if group == key:
                continue
            dist = dist_dict.get(key, float('inf'))
            row[group] = (1, dist) if dist != float('inf') else (0, 0)

        data.append(row)

# Create a pandas DataFrame from the list of dictionaries
df = pd.DataFrame(data)

# Reorder the columns so that SMILES comes first
cols = list(df.columns)
cols.remove('SMILES')
cols = ['SMILES'] + cols
df = df[cols]

# Fill missing values with zeros
df = df.fillna(0)

# Display the DataFrame
print(df)
#attempt 2

from rdkit import Chem
from queue import PriorityQueue

def collect_bonded_atoms(smile):
    mol = Chem.MolFromSmiles(smile)
    Chem.Kekulize(mol)
    atom_counts = {}
    for atom in mol.GetAtoms():
        symbol = atom.GetSymbol()
        charge = '+' if atom.GetFormalCharge() > 0 else '-' if atom.GetFormalCharge() < 0 else ''
        neighbors = [(neighbor.GetSymbol(), bond.GetBondType())
                     for neighbor, bond in zip(atom.GetNeighbors(), atom.GetBonds())]
        neighbors.sort()
        key = "{}{}-{}".format(charge, symbol, "".join(f"{symbol}{'-' if bond_order == 1 else '=' if bond_order == 2 else '#' if bond_order == 3 else '@'}" for symbol, bond_order in neighbors))
        atom_counts[key] = atom_counts.get(key, 0) + 1
    return atom_counts

def dijkstra(mol, start_atom_index):
    """Computes the shortest path between a start atom and all other atoms in the molecule using Dijkstra's algorithm.
    
    Args:
    mol (RDKit Mol object): The input molecule.
    start_atom_index (int): The index of the start atom.
    
    Returns:
    A list of distances from the start atom to each other atom in the molecule, where the i-th element is the distance from the start atom to the i-th atom in the molecule.
    """
    # Initialize distances to infinity
    distances = [float('inf')] * mol.GetNumAtoms()
    distances[start_atom_index] = 0
    
    # Initialize priority queue with start atom
    pq = PriorityQueue()
    pq.put((0, start_atom_index))
    
    # Loop until all atoms have been visited
    while not pq.empty():
        # Get the next atom with the shortest distance
        dist, atom_index = pq.get()
        
        # Update the distances of its neighbors
        atom = mol.GetAtomWithIdx(atom_index)
        for neighbor in atom.GetNeighbors():
            neighbor_index = neighbor.GetIdx()
            bond = mol.GetBondBetweenAtoms(atom_index, neighbor_index)
            bond_length = bond.GetBondTypeAsDouble()
            new_distance = dist + bond_length
            
            # Update the distance if it is shorter
            if new_distance < distances[neighbor_index]:
                distances[neighbor_index] = new_distance
                pq.put((new_distance, neighbor_index))
    
    return distances

# Define a list of SMILES
smile_list = ['CC(C)(C)C1=C(C(=C(C=C1)O)O)O', 'C1CCCCC1']

# Create a list to hold the tuples for each SMILES
smile_tuples = []

# Loop through each SMILES and create a tuple for each group and its location
for smile in smile_list:
    # Create a dictionary of group frequencies for this SMILES
    atom_counts = collect_bonded_atoms(smile)
    
    # Compute the topical distance matrix for this SMILES
    mol = Chem.MolFromSmiles(smile)
    distances = dijkstra(mol, 0)
    
    # Create a list of tuples for this SMILES, where each tuple is (group, distance)
    tuples = []
    for group, frequency in atom_counts.items():
        symbol = group.split('-', 1)[1].split('-')[0]
        for i in range(frequency):
            atom_index = Chem.MolFromSmiles(smile).GetSubstructMatches(Chem.MolFromSmiles(symbol))[i][0]
            distance = distances[atom_index]
            tuples.append((group, distance))
print(tuples)
print(distances)
print(group)
print(symbol)
#attempt 3
import pandas as pd
from rdkit import Chem
from queue import PriorityQueue

def dijkstra(mol, start_atom_index):
    """Computes the shortest path between a start atom and all other atoms in the molecule using Dijkstra's algorithm.
    
    Args:
    mol (RDKit Mol object): The input molecule.
    start_atom_index (int): The index of the start atom.
    
    Returns:
    A list of distances from the start atom to each other atom in the molecule, where the i-th element is the distance from the start atom to the i-th atom in the molecule.
    """
    # Initialize distances to infinity
    distances = [float('inf')] * mol.GetNumAtoms()
    distances[start_atom_index] = 0
    
    # Initialize priority queue with start atom
    pq = PriorityQueue()
    pq.put((0, start_atom_index))
    
    # Loop until all atoms have been visited
    while not pq.empty():
        # Get the next atom with the shortest distance
        dist, atom_index = pq.get()
        
        # Update the distances of its neighbors
        atom = mol.GetAtomWithIdx(atom_index)
        for neighbor in atom.GetNeighbors():
            neighbor_index = neighbor.GetIdx()
            bond = mol.GetBondBetweenAtoms(atom_index, neighbor_index)
            bond_length = bond.GetBondTypeAsDouble()
            new_distance = dist + bond_length
            
            # Update the distance if it is shorter
            if new_distance < distances[neighbor_index]:
                distances[neighbor_index] = new_distance
                pq.put((new_distance, neighbor_index))
    
    return distances

def collect_bonded_atoms_and_distances(smile):
    mol = Chem.MolFromSmiles(smile)
    atom_counts = {}
    distances = {}
    for atom in mol.GetAtoms():
        charge = '+' if atom.GetFormalCharge() > 0 else '-' if atom.GetFormalCharge() < 0 else ''
        neighbors = [(neighbor.GetSymbol(), bond.GetBondType())
                     for neighbor, bond in zip(atom.GetNeighbors(), atom.GetBonds())]
        neighbors.sort()
        key = "{}{}-{}".format(charge, atom.GetSymbol(), "".join(f"{symbol}{'-' if bond_order == 1 else '=' if bond_order == 2 else '#' if bond_order == 3 else '@'}" for symbol, bond_order in neighbors))
        if key not in atom_counts:
            atom_counts[key] = 0
            distances[key] = []
        atom_counts[key] += 1
        distances[key].extend(dijkstra(mol, atom.GetIdx()))

    return {**atom_counts, **distances}

smile_list = ['CC(C)(C)C1=C(C(=C(C=C1)O)O)O']

# Create a list of dictionaries for each SMILES
data = []
for smile in smile_list:
    counts_and_distances = collect_bonded_atoms_and_distances(smile)
    data.append(counts_and_distances)

# Create a pandas DataFrame from the list of dictionaries
df = pd.DataFrame(data)

# Fill missing values with zeros
df = df.fillna(0)

# Transpose the DataFrame to match the desired output format
df = df.T

# Rename the columns and index
df.columns = [f"Atom {i}" for i in range(1, len(df.columns) + 1)]
df.index.name = 'Functional group'

# Display the final DataFrame
print(df)
#attempt 4
import pandas as pd
from rdkit import Chem
from queue import PriorityQueue

def dijkstra(mol, start_atom_index):
    """Computes the shortest path between a start atom and all other atoms in the molecule using Dijkstra's algorithm.
    
    Args:
    mol (RDKit Mol object): The input molecule.
    start_atom_index (int): The index of the start atom.
    
    Returns:
    A list of distances from the start atom to each other atom in the molecule, where the i-th element is the distance from the start atom to the i-th atom in the molecule.
    """
    # Initialize distances to infinity
    distances = [float('inf')] * mol.GetNumAtoms()
    distances[start_atom_index] = 0
    
    # Initialize priority queue with start atom
    pq = PriorityQueue()
    pq.put((0, start_atom_index))
    
    # Loop until all atoms have been visited
    while not pq.empty():
        # Get the next atom with the shortest distance
        dist, atom_index = pq.get()
        
        # Update the distances of its neighbors
        atom = mol.GetAtomWithIdx(atom_index)
        for neighbor in atom.GetNeighbors():
            neighbor_index = neighbor.GetIdx()
            bond = mol.GetBondBetweenAtoms(atom_index, neighbor_index)
            bond_length = bond.GetBondTypeAsDouble()
            new_distance = dist + bond_length
            
            # Update the distance if it is shorter
            if new_distance < distances[neighbor_index]:
                distances[neighbor_index] = new_distance
                pq.put((new_distance, neighbor_index))
    
    return distances

def collect_bonded_atoms_and_distances(smile):
    mol = Chem.MolFromSmiles(smile)
    atom_counts = {}
    for atom in mol.GetAtoms():
        charge = '+' if atom.GetFormalCharge() > 0 else '-' if atom.GetFormalCharge() < 0 else ''
        neighbors = [(neighbor.GetSymbol(), bond.GetBondType())
                     for neighbor, bond in zip(atom.GetNeighbors(), atom.GetBonds())]
        neighbors.sort()
        key = "{}{}-{}".format(charge, atom.GetSymbol(), "".join(f"{symbol}{'-' if bond_order == 1 else '=' if bond_order == 2 else '#' if bond_order == 3 else '@'}" for symbol, bond_order in neighbors))
        atom_counts[key] = atom_counts.get(key, 0) + 1
    
    distances = {}
    for key in atom_counts:
        distances[key] = []
        indices = [atom.GetIdx() for atom in mol.GetAtoms() if atom_counts[key] > 0 and atom.GetSmarts() == key]
        for index in indices:
            dists = dijkstra(mol, index)
            for i, d in enumerate(dists):
                if i not in distances[key]:
                    distances[key].append(d)
            atom_counts[key] -= 1

    return {**atom_counts, **distances}

smile_list = ['CC(C)(C)C1=C(C(=C(C=C1)O)O)O']

# Create a list of dictionaries for each SMILES
data = []
for smile in smile_list:
    counts_and_distances = collect_bonded_atoms_and_distances(smile)
    data.append(counts_and_distances)

# Create a pandas DataFrame from the list of dictionaries
df = pd.DataFrame(data)

# Fill missing values with zeros
df = df.fillna(0)

# Add a column for the SMILES
df.insert(0, 'SMILES', smile_list)

# Display the DataFrame
print(df)
fdbelqdn

fdbelqdn1#

像这样修改函数collect_bonded_atoms

def collect_bonded_atoms(smile, array):
    mol = Chem.MolFromSmiles(smile)
    atom_counts = {}
    for idx, atom in enumerate(mol.GetAtoms()):
        charge = '+' if atom.GetFormalCharge() > 0 else '-' if atom.GetFormalCharge() < 0 else ''
        neighbors = [(neighbor.GetSymbol(), bond.GetBondType())
                     for neighbor, bond in zip(atom.GetNeighbors(), atom.GetBonds())]
        neighbors.sort()
        key = "{}{}-{}".format(charge, atom.GetSymbol(), "".join(f"{symbol}{'-' if bond_order == 1 else '=' if bond_order == 2 else '#' if bond_order == 3 else '@'}" for symbol, bond_order in neighbors))
        
        if key in atom_counts:
            atom_counts[key][0].append(array[idx])
        else:
            atom_counts[key] = [[array[idx]]]
    return atom_counts

根本不要更改函数dijkstra
使用如下函数:

smile_list = ['CC(C)(C)C1=C(C(=C(C=C1)O)O)O']
mol = Chem.MolFromSmiles(smile_list[0])
distances = dijkstra(mol, 0)
data = collect_bonded_atoms(smile_list[0], distances)
data['SMILES'] = [smile_list[0]]
df = pd.DataFrame(data)
print(df)

老实说:我不知道这是否是你逻辑上想要实现的,但它的输出是你所要求的。
现在,每当collect_bonded_atoms找到一个“组”时,索引(我猜是该组的第一个原子)就会被获取并Map到distances-array。
我还没有修改代码的功能,在smile_list中为多个SMILES创建一个巨大的 Dataframe ,但我想这不是你问题的主要焦点,是吗?
让我知道你的情况如何。

相关问题