////////////////////////////////////////////////////////////////////////////// // // elsize.cc Calculates the electrostatic size, A, of a molecule using // atomic coordinates and radii from its PQR (or simply xyz) file. // Author: Grigori Sigalov , // // Algorithm Details are in: // // "Analytical Linearized Poisson--Boltzmann Approach // for Molecular Dynamics Applications", by Sigalov, Fenley and Onufriev // J. Chem. Phys., 124, 124902 (2006) // //////////////////////////////////////////////////////////////////////////// // Brief algorithm: First, the moments of inertia of the molecule are // calculated. If the program is run without second parameter or, which is the // same, with second parameter being "-det", the electrostatic size is found // from the third invariant (determinant) of the tensor of inertia. Otherwise // some more work has to be done. The _principal_ moments of inertia are found // by solving the cubic equation det(I - lambda E) = 0. The semiaxes of the // ellipsoid that has same principal moments of inertia and mass as the // original molecule are found. Then, there are two possibilities: option // "-ell" leads to calculation of the exact electrostatic size, A_ell, of the // effective ellipoid just found, which involves numerical summation by // Simpson method of an elliptic intergal of the first kind (to handle the // infinite limit, the integral is transformed to map infinity to zero). With // option "-elf", the value A_elf, which is an approximation to A_ell // expressed using elementary functions only, is calculated. Option "-abc" // prints the semiaxes of the effective ellipsoid. Option "-tab" prints all // of the above into a table; option -hea prints table with a header. // Finally, option -deb prints additionally some extra information. Any of // these options, if used at all, MUST follow the input file name. // By default, the input file is in PQR format. If you have only PDB file // available, convert it to a plain XYZ format and use an extra option, // "-xyz". In this case all atomic sizes are set to the same value, // DEFAULTATOMSIZE. // Shortly, the following hierarchy can be built: // 1. -det (default) gives a reasonable estimate of the electrostatic size. In // most cases, you will want nothing more. Moreover, A_det has simple // analytical derivatives, so if you are going to use derivatives of the // electrostatic size, you should choose A_det. This option involves // calculation of the moments of inertia of the molecule, which is a very // straighforward procedure. // 2. -elf adds some accuracy if the molecule's shape is close to an // ellipsoid, which rarely happens, at the price of a cubic equation to solve. // 3. -ell gives an exact solution for an ellipsoidal molecule, and involves // numerical integration in addition to the cubic equation. Options -abc, // -tab, -hea, and -deb include -ell and therefore take the same amount of // calculation. // one limitation of this algorithm is that it can only represent as many atoms // as the integer type holding the number of atoms field, there is such an extension // as long long (supporting 64 bit integer and above) which is fairly popular // and something of this nature will eventually be needed in order for this // algorithm to scale in the long-term. For now we will simply type it and // users with this compiler extension (g++ has such a thing) may enable it by // compiling with -DLONG_LONG_SUPPORT #ifdef LONG_LONG_SUPPORT typedef unsigned long long atom_count; #else typedef unsigned long atom_count; #endif #include #include #include #include #define DEFAULTATOMSIZE 1.5 #define ACCURACY 1.E-09 // convergence of Simpson's method, stopping condition #include #define TOL 1.E-12 // values < TOL are treated as zeros using namespace std; atom_count GetNumberOfAtoms(const char *fileName, int XYZ_format); atom_count ReadAtomicCoordinates(const char *fileName, int XYZ_format, atom_count natoms, double *x, double *y, double *z, double *r2, double *r3); double EllipticIntegral(double a, double b, double c, double accuracy); inline double A_elementary_functions(double a, double b, double c); // a couple of auxiliary functions used in numerical integration procedure inline double function1(double a, double b, double c, double x); inline double function2(double a, double b, double c, double x); int debug = 0; // this setting (default) suppresses all warnings; call with key -deb if unsure about your input file int main( int argc, char * argv[] ) { atom_count natoms = 0; if (argc < 2) // there is NO argument argv[1] { fprintf (stderr, "%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n", "I need a valid PQR (or XYZ) file as the first command-line argument.", "Usage: esize your_structure.pqr [ arg ]", "The second argument is optional: \n -det (default) gives A_det,", " -ell gives A_ell (elliptic integral),", " -elf gives A_elf (elementary functions approximation to A_ell, normally less than 0.1A apart),", " -abc prints a, b, c (semiaxes of the effective ellipsoid, just out of curiousity)", " -tab prints PQR file name and all of the above into a table without header", " -hea prints same table as -tab but with a header", " -deb prints same as -tab with some extra (debugging) information", " -xyz uses a file containing only XYZ coordinates as input."); exit(1); } int option = 0; // default output option, same as "-det" or any unrecognized argument int header = 0; if ( argc > 2 ) { if ( !strcmp(argv[2],"-ell") ) option = 1; else if ( !strcmp(argv[2],"-elf") ) option = 2; else if ( !strcmp(argv[2],"-abc") ) option = 3; else if ( !strcmp(argv[2],"-tab") ) option = 4; else if ( !strcmp(argv[2],"-hea") ) { option = 4; header = 1; } else if ( !strcmp(argv[2],"-deb") ) { option = 4; debug = 1; } } int XYZ_format = 0; // by default, the input file has PQR format (not XYZ) if ( ( argc == 3 && !strcmp(argv[2],"-xyz") ) || ( argc == 4 && !strcmp(argv[3],"-xyz") ) ) XYZ_format = 1; double *x, /* x position of the atoms */ *y, /* y position of the atoms */ *z, /* z position of the atoms */ *r2, /* r^2 (used to compute the moment about the centroid) */ *r3; /* r^3 (an estimate of atomic mass) */ atom_count i, natoms_check; natoms = GetNumberOfAtoms(argv[1], XYZ_format); /* now we know how many atoms there were, and that there were no inconsistencies */ /* allocate all our arrays */ x = (double *)calloc(natoms, sizeof(double)); y = (double *)calloc(natoms, sizeof(double)); z = (double *)calloc(natoms, sizeof(double)); r2 = (double *)calloc(natoms, sizeof(double)); r3 = (double *)calloc(natoms, sizeof(double)); natoms_check = ReadAtomicCoordinates(argv[1], XYZ_format, natoms, x, y, z, r2, r3); if ( natoms_check != natoms ) { fprintf(stderr,"First time %ld data lines were read, now it's %ld\n...exiting ...\n", natoms, natoms_check); exit(1); } double xav = 0., yav = 0., zav = 0., // center-of-mass coordinates molecule_mass = 0., atom_mass; for (i = 0; i= 0. && Q != 0 && fabs(P/Q) < TOL) P = 0.; // if molecule is exactly spherical, extra care should be taken because both P and Q == 0 if ( I12 == 0. && I13 == 0. && I23 == 0. ) { P = 0.; Q = 0.; } if (P > 0.) { fprintf(stderr, "Problem with the cubic equation: at least some of the roots are not real!\n"); fprintf(stderr, "Original cubic equation's coefficients: A = %g, B = %g, C = %g\n", A, B, C); fprintf(stderr, "Reduced cubic equation's coefficients: P = %g, Q = %g\n", P, Q); exit(1); } else if (P == 0.) // degeneration a = b = c = -Q^{1/3} { if ( Q > 0 ) root1 = root2 = root3 = - pow(Q, 1./3.) - A/3.; else if ( Q < 0 ) root1 = root2 = root3 = pow(-Q, 1./3.) - A/3.; else root1 = root2 = root3 = - A/3.; } else { double paux = 2. * sqrt( - P / 3. ); double cos_alpha = - 4. * Q / paux / paux / paux; double alpha = acos(cos_alpha); root1 = paux * cos(alpha/3.) - A/3.; root2 = paux * cos(alpha/3. + M_PI/3.) - A/3.; root3 = paux * cos(alpha/3. - M_PI/3.) - A/3.; } // (sorting them) On the second thought, I will sort a, b, c later, so why bother sort the roots? // now root1 <= root2 <= root3 Ixx = root1; // smallest moment => largest axis Iyy = root2; Izz = root3; // finally, let's find the axes! double a = sqrt( 2.5 * (-Ixx + Iyy + Izz) / molecule_mass ); double b = sqrt( 2.5 * ( Ixx - Iyy + Izz) / molecule_mass ); double c = sqrt( 2.5 * ( Ixx + Iyy - Izz) / molecule_mass ); double d; if (b > a) { d = a; a = b; b = d; } // to make sure that a >= b >= c if (c > b) { d = b; b = c; c = d; } if (b > a) { d = a; a = b; b = d; } if ( c <= 0. ) { printf("Problem with the semiaxes: a = %g, b = %g, c = %g\n", a, b, c); exit(1); } if ( option == 3 ) // print a, b, c { printf("%.3f %9.3f %9.3f ", a, b, c); printf("\n"); return 0; } double A_ell = 0., A_elf = 0.; if ( option == 1 || option == 4 ) { if ( b - c < TOL ) // basically, b==c A_ell = A_elementary_functions(a, b, c); // it's exact in this particular case else A_ell = 2. / EllipticIntegral(a, b, c, ACCURACY); } if ( option == 1 ) { printf("%.3f", A_ell); printf("\n"); return 0; } if ( option == 2 || option == 4 ) A_elf = A_elementary_functions(a, b, c); if ( option == 2 ) { printf("%.3f", A_elf); printf("\n"); return 0; } if ( header || debug ) printf("Filename a b c A_ell A_elf A_det\n"); printf("%-20s %8.3f %8.3f %8.3f %8.3f %8.3f %8.3f", argv[1], a, b, c, A_ell, A_elf, A_det); printf("\n"); return 0; } //---------------------------------------------------------------------------- // Calculates $\int_0^\infty \frac {d\theta}{ \sqrt{ (a^2+\theta)(b^2+\theta)(c^2+\theta) } }$ // by 1-4-1 Simpson method double EllipticIntegral(double a, double b, double c, double accuracy) { double d; if ( a == b && b == c ) return a; d = c; // wild guess int i, n, n0=16; double step, lim, sum1=0., sum2=0., sum_aux, total = 0., prev=1.; // calculating 1st integral from 0 to d/c lim = d / c; for (n=n0; fabs(sum1 - prev) > accuracy; n*=2) { prev = sum1; step = lim / n; sum1 = function1(a, b, c, 0.) + function1(a, b, c, lim); sum_aux = 0.; for (i = 1; i < n; i+=2) sum_aux += function1(a, b, c, i * step); sum1 += 4. * sum_aux; sum_aux = 0.; for (i = 2; i < n; i+=2) sum_aux += function1(a, b, c, i * step); sum1 += 2. * sum_aux; sum1 *= step / 3. / c; // because I transformed the integral and 1/c appeared in front of it } // calculating 2nd integral from d to \infty. It is transformed to integral from 0 to 1/d, // but the first part from 0 to some eps must be calculated analytically. lim = 1./d; prev=1.; // different enough from sum2 = 0 (initially) for (n=n0; fabs(sum2 - prev) > accuracy; n*=2) { prev = sum2; step = lim / n; sum2 = function2(a, b, c, 0.) + function2(a, b, c, lim); sum_aux = 0.; for (i = 1; i < n; i+=2) sum_aux += function2(a, b, c, i * step); sum2 += 4. * sum_aux; sum_aux = 0.; for (i = 2; i < n; i+=2) sum_aux += function2(a, b, c, i * step); sum2 += 2. * sum_aux; sum2 *= 2. * step / 3.; } total = sum1 + sum2; return total; } //---------------------------------------------------------------------------- double function1(double a, double b, double c, double x) { double ac = a / c; double bc = b / c; double f = (x + ac*ac) * (x + bc*bc) * (x + 1.); f = 1. / sqrt(f); return f; } //---------------------------------------------------------------------------- double function2(double a, double b, double c, double x) { double xx = x * x; double f = (1. + a * a * xx) * (1. + b * b * xx) * (1. + c * c * xx); f = 1. / sqrt(f); return f; } //---------------------------------------------------------------------------- double A_elementary_functions(double a, double b, double c) { if ( a - b < TOL && b - c < TOL ) // means that a == b && b == c return a; double gamma = sqrt( 1. - (b+c)*(b+c)/a/a/4. ); double A_elf = 2. * a * gamma / log( (1+gamma) / (1-gamma) ); return A_elf; } //---------------------------------------------------------------------------- // Opens input file, scans is to find out the number of valid data lines, and closes it. // No data are actually read here. // If flag XYZ_format is set, every non-empty line is considered a data line atom_count GetNumberOfAtoms(const char *fileName, int XYZ_format) { ifstream scanstream; scanstream.open(fileName); if (!scanstream) { fprintf(stderr,"Error opening file %s to read...\n...exiting ...\n", fileName); exit(1); } /* count number of atoms and HETATMs in the file */ /* and scan for inconsistencies */ atom_count i, j, j_prev=0, natoms=0; string junk; for (i=0; scanstream; i++) /* i is the line count; not every line is a data line! */ { scanstream >> junk; /* catches that trailing newline */ if (!scanstream) break; if ( XYZ_format ) { scanstream>>junk>>junk; // one field was read above natoms++; } else if ((junk == "ATOM") || (junk == "HETATM")) { scanstream>>j; scanstream.ignore(255, '\n'); if ( j_prev+1 != j && i && debug ) // shouldn't check it for the very first line, i==0 { fprintf(stderr, "Error in %s:\n there appears to be an inconsistency in atom numbering between lines %ld and %ld\n", fileName, (long)i, (long)i+1); } // this inconsistency may exist for a good reason j_prev = j; natoms++; } else { /* gobble down the rest of the line */ if ( debug ) fprintf(stderr, "Ignoring a line, tag is %s, line %ld\n", (char *)junk.c_str(), (long)i); scanstream.ignore(255, '\n'); } } scanstream.close(); return natoms; } //---------------------------------------------------------------------------- // Actually reads the coordinates of atoms and, if it's a PQR format file, atomic sizes atom_count ReadAtomicCoordinates(const char *fileName, int XYZ_format, atom_count natoms, double *x, double *y, double *z, double *r2, double *r3) { ifstream datastream; datastream.open(fileName); if (!datastream) { fprintf(stderr,"Error opening file %s to read...\n...exiting ...\n",fileName); exit(1); } atom_count i; double r; string junk; for (i = 0; !datastream.eof() && i> x[i] >> y[i] >> z[i]; if ( !i ) { r2[i] = DEFAULTATOMSIZE * DEFAULTATOMSIZE; r3[i] = r2[i] * DEFAULTATOMSIZE; } else { r2[i] = r2[i-1]; // all atoms are assigned same size r3[i] = r3[i-1]; // if it's unknown anyway } i++; } else { datastream >> junk; if ((junk == "ATOM")||(junk == "HETATM")) { datastream >> junk >> junk >> junk >> junk >> x[i] >> y[i] >> z[i] >> junk >> r; r2[i] = r * r; r3[i] = r2[i] * r; i++; } else { /* gobble down the rest of the line */ datastream.ignore(255, '\n'); } } } datastream.close(); return i; // used to check if both times the same number of data lines were read }